convertdump.py

   1 #!/usr/bin/python
   2
   3 # Copyright 2009, Sean M. Graham (www.sean-graham.com)
   4 # All rights reserved.
   5 #
   6 # Redistribution and use in source and binary forms, with or without
   7 # modification, are permitted provided that the following conditions are
   8 # met:
   9 #
  10 # - Redistributions of source code must retain the above copyright notice,
  11 #   this list of conditions and the following disclaimer.
  12 #
  13 # - Redistributions in binary form must reproduce the above copyright notice,
  14 #   this list of conditions and the following disclaimer in the documentation
  15 #   and/or other materials provided with the distribution.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18 # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  20 # EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  21 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  23 # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  26 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 import xml.dom.minidom
  29 import os
  30 import codecs
  31 import sys
  32 import getopt
  33 import re
  34
  35 from time import strptime, strftime
  36
  37 def getNodeText(doc, nodename):
  38     rc = ""
  39
  40     try:
  41         nodelist = doc.getElementsByTagName(nodename)[0].childNodes
  42     except:
  43         return ""
  44
  45     for node in nodelist:
  46         if node.nodeType == node.TEXT_NODE:
  47             rc = rc + node.data
  48
  49     return rc
  50
  51 def appendTextNode(doc, parent, nodename, value):
  52     nodeValue = value
  53
  54     # make sure value is properly encoded
  55     try:
  56         bytes = nodeValue.encode("UTF-8")
  57     except:
  58         bytes = nodeValue.encode("cp1252")
  59         nodeValue = unicode(bytes, "UTF-8")
  60
  61     element = doc.createElement(nodename)
  62
  63     if( nodeValue != "" ):
  64         textNode = doc.createTextNode(nodeValue)
  65         element.appendChild(textNode)
  66
  67     parent.appendChild(element)
  68
  69
  70 def addEntryForId(outDoc, element, username, id, includeSecure):
  71     entryFile = open("%s/L-%s" % (username,id), "r")
  72     inDoc = xml.dom.minidom.parse(entryFile)
  73
  74     # Create an entry element
  75     entry = outDoc.createElement("entry")
  76
  77     # Create an itemid element
  78     appendTextNode(outDoc, entry, "itemid", getNodeText(inDoc,"itemid"))
  79
  80     # Create an eventtime element
  81     appendTextNode(outDoc, entry, "eventtime", getNodeText(inDoc, "eventtime"))
  82
  83     # Create an subject element
  84     appendTextNode(outDoc, entry, "subject", getNodeText(inDoc, "subject"))
  85
  86     # Create an event node (special case because for some reason there are two
  87     # 'event' elements in the pydump output, which is probably LJ's fault)
  88     event = inDoc.getElementsByTagName("event")[0]
  89     eventText = getNodeText(event, "event")
  90
  91     appendTextNode(outDoc, entry, "event", replaceLJTags(eventText))
  92
  93     security = getNodeText(inDoc, "security")
  94
  95     if(security != ""):
  96         # don't append this entry unless the user provided the argument
  97         if(includeSecure == False):
  98             print("omitting secure entry: L-%s" % id)
  99             return
 100         else:
 101             if(security == "usemask"):
 102                 print("including allowmask entry: L-%s" % id)
 103
 104                 # Create an allowmask element
 105                 maskText = getNodeText(inDoc, "allowmask")
 106
 107                 if(maskText != ""):
 108                     appendTextNode(outDoc, entry, "allowmask", maskText)
 109                 else:
 110                     appendTextNode(outDoc, entry, "allowmask", "0")
 111             else:
 112                 print("including private entry: L-%s" % id)
 113
 114         appendTextNode(outDoc, entry, "security", security)
 115
 116     # Create a taglist element
 117     appendTextNode(outDoc, entry, "taglist", getNodeText(inDoc, "taglist"))
 118
 119     # XXXSMG: make sure there is a comment file before trying to do anything
 120     # with it
 121     addCommentsForId(outDoc, entry, username, id)
 122
 123     element.appendChild(entry)
 124
 125 def addCommentsForId(outDoc, entry, username, id):
 126     try:
 127         commentFile = open("%s/C-%s" % (username,id), "r")
 128     except IOError:  # there are no comments for this entry
 129         return
 130
 131     inDoc = xml.dom.minidom.parse(commentFile)
 132
 133     comments = inDoc.getElementsByTagName("comment")
 134
 135     for comment in comments:
 136         outComment = outDoc.createElement("comment")
 137         entry.appendChild(outComment)
 138
 139         # add the item id for the comment
 140         appendTextNode(outDoc, outComment, "itemid",
 141             getNodeText(comment, "id"))
 142
 143         # convert the time string
 144         timeString = getNodeText(comment, "date")
 145         if( timeString != "" ):
 146             inDate = strptime(timeString, "%Y-%m-%dT%H:%M:%SZ")
 147             outDate = strftime("%Y-%m-%d %H:%M:%S", inDate)
 148             appendTextNode(outDoc, outComment, "eventtime", outDate)
 149         else:
 150             emptyTime = outDoc.createElement("eventtime")
 151             outComment.appendChild(emptyTime)
 152
 153         # Create an subject element
 154         appendTextNode(outDoc, outComment, "subject",
 155             getNodeText(comment, "subject"))
 156
 157         # Create an event element
 158         bodyText = getNodeText(comment, "body")
 159         appendTextNode(outDoc, outComment, "event", replaceLJTags(bodyText))
 160
 161         # Create the author element
 162         author = outDoc.createElement("author")
 163         outComment.appendChild(author)
 164
 165         try:
 166             cUser = getNodeText(comment, "user")
 167         except:
 168             cUser = "anonymous"
 169
 170         appendTextNode(outDoc, author, "name", cUser)
 171         appendTextNode(outDoc, author, "email", cUser + "@livejournal.com")
 172
 173         # Create the parent_itemid
 174         parentId = getNodeText(comment, "parentid")
 175         if(parentId != ""):
 176             appendTextNode(outDoc, outComment, "parent_itemid", parentId)
 177
 178
 179 # regular expressions used in replaceLJTags()
 180 #   (global for later reuse - suggestion by jparise)
 181
 182 userRE = re.compile('<lj user="(.*?)" ?/?>', re.IGNORECASE)
 183 commRE = re.compile('<lj comm="(.*?)" ?/?>', re.IGNORECASE)
 184 namedCutRE = re.compile('<lj-cut +text="(.*?)" ?/?>',
 185                         re.IGNORECASE|re.DOTALL)
 186 cutRE = re.compile('<lj-cut>', re.IGNORECASE)
 187 cutRE = re.compile('</lj-cut>', re.IGNORECASE)
 188 embedRE = re.compile('<lj-embed id="[0-9]+">', re.IGNORECASE)
 189
 190 def replaceLJTags(entry):
 191     rv = entry
 192
 193     userRE = re.compile('<lj user="(.*?)" ?/?>', re.IGNORECASE)
 194     commRE = re.compile('<lj comm="(.*?)" ?/?>', re.IGNORECASE)
 195     namedCutRE = re.compile('<lj-cut +text="(.*?)" ?/?>',
 196                             re.IGNORECASE|re.DOTALL)
 197     cutRE = re.compile('<lj-cut>', re.IGNORECASE)
 198     cutRE = re.compile('</lj-cut>', re.IGNORECASE)
 199     embedRE = re.compile('<lj-embed id="[0-9]+">', re.IGNORECASE)
 200
 201     # replace lj user tags
 202     rv = re.sub(userRE, '<a href="http://\\1.livejournal.com/" class="lj-user">\\1</a>', rv)
 203
 204     # replace lj comm tags
 205     rv = re.sub(commRE, '<a href="http://community.livejournal.com/\\1/" class="lj-comm">\\1</a>', rv)
 206
 207     # replace lj-cut tags
 208     rv = re.sub(namedCutRE, '<!--more \\1-->', rv)
 209     rv = re.sub(cutRE, '<!--more-->', rv)
 210     rv = re.sub(cutRE, '', rv)
 211
 212     # replace lj-embed tags
 213     rv = re.sub(embedRE, '', rv)
 214
 215     return rv
 216
 217
 218 def usage():
 219     print( "Usage: convertdump.py [arguments]" )
 220     print( """
 221 This will convert a pydump archive into something compatible with the
 222 WordPress LiveJournal importer.  This is the same format used by the Windows
 223 ljArchive exporter.
 224
 225 Arguments:
 226     -u  --user      username of archive to process [required]
 227     -l  --limit     limit the number of entries in each xml file (default 250)
 228     -i  --insecure  include private and protected entries in the output
 229     -h  --help      show this help page
 230
 231 Example:
 232     ./convertdump.py --user stevemartin --limit 200 --insecure
 233 """)
 234
 235
 236 def main(argv):
 237     username = ""
 238     entryLimit = 250
 239     includeSecure = False;
 240
 241     if( len(argv) == 0 ):
 242         usage()
 243         sys.exit(2)
 244
 245     try:
 246         opts, args = getopt.getopt(sys.argv[1:], "hu:l:i", ["help",
 247                                                             "user=",
 248                                                             "limit=",
 249                                                             "insecure"])
 250     except getopt.GetoptError, err:
 251         # print help information and exit:
 252         print str(err) # will print something like "option -a not recognized"
 253         usage()
 254         sys.exit(2)
 255
 256     for o, a in opts:
 257         if o == "-v":
 258             verbose = True
 259         elif o in ("-u", "--user"):
 260             username = a
 261         elif o in ("-l", "--limit"):
 262             entryLimit = int(a)
 263         elif o in ("-i", "--insecure"):
 264             print( "Warning:  Including secure entries in XML output" )
 265             includeSecure = True
 266         elif o in ("-h", "--help"):
 267             usage()
 268             sys.exit()
 269         else:
 270             assert False, "unhandled option"
 271
 272     userDir = os.listdir(username)
 273
 274     highNum = -1
 275     entryArray = []
 276
 277     # get the list of entries
 278     for file in userDir:
 279         if file.startswith("L-"):
 280             entryNum = int(file.replace("L-",""))
 281
 282             entryArray.append(entryNum)
 283
 284             if( highNum < entryNum ):
 285                 highNum = entryNum
 286
 287     entryArray.sort()
 288
 289     # Create the minidom document
 290     outDoc = xml.dom.minidom.Document()
 291
 292     # Create the <livejournal> base element
 293     ljElement = outDoc.createElement("livejournal")
 294     outDoc.appendChild(ljElement)
 295
 296     currentFileEntry = 0
 297
 298     # start processing entries
 299     for entry in entryArray:
 300         addEntryForId(outDoc, ljElement, username, entry, includeSecure)
 301
 302         currentFileEntry += 1
 303
 304         if( currentFileEntry == entryLimit or entry == entryArray[-1] ):
 305
 306             f = open("%s - %s.xml" % (username, entry), "w")
 307             tempXML = outDoc.toxml("UTF-8")
 308             f.write(tempXML)
 309
 310             currentFileEntry = 0
 311
 312             # Create the minidom document
 313             outDoc = xml.dom.minidom.Document()
 314
 315             # Create the <livejournal> base element
 316             ljElement = outDoc.createElement("livejournal")
 317             outDoc.appendChild(ljElement)
 318
 319 if __name__ == "__main__":
 320     main(sys.argv[1:])
 321