convertdump.py

   1 #!/usr/bin/python
   2
   3 # Copyright 2009, Sean M. Graham (www.sean-graham.com)
   4 # All rights reserved.
   5 #
   6 # Redistribution and use in source and binary forms, with or without
   7 # modification, are permitted provided that the following conditions are
   8 # met:
   9 #
  10 # - Redistributions of source code must retain the above copyright notice,
  11 #   this list of conditions and the following disclaimer.
  12 #
  13 # - Redistributions in binary form must reproduce the above copyright notice,
  14 #   this list of conditions and the following disclaimer in the documentation
  15 #   and/or other materials provided with the distribution.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18 # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  20 # EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  21 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  23 # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  26 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 import xml.dom.minidom
  29 import os
  30 import codecs
  31 import sys
  32
  33 from time import strptime, strftime
  34
  35 def getNodeText(doc, nodename):
  36     rc = ""
  37
  38     try:
  39         nodelist = doc.getElementsByTagName(nodename)[0].childNodes
  40     except:
  41         return ""
  42
  43     for node in nodelist:
  44         if node.nodeType == node.TEXT_NODE:
  45             rc = rc + node.data
  46
  47     return rc
  48
  49 def appendTextNode(doc, parent, nodename, value):
  50     nodeValue = value
  51
  52     # make sure value is properly encoded
  53     try:
  54         bytes = nodeValue.encode("UTF-8")
  55     except:
  56         bytes = nodeValue.encode("cp1252")
  57         nodeValue = unicode(bytes, "UTF-8")
  58
  59     element = doc.createElement(nodename)
  60
  61     if( nodeValue != "" ):
  62         textNode = doc.createTextNode(nodeValue)
  63         element.appendChild(textNode)
  64
  65     parent.appendChild(element)
  66
  67
  68 def addEntryForId(outDoc, element, username, id):
  69     entryFile = open("%s/L-%s" % (username,id), "r")
  70     inDoc = xml.dom.minidom.parse(entryFile)
  71
  72     # Create an entry element
  73     entry = outDoc.createElement("entry")
  74     element.appendChild(entry)
  75
  76     # Create an itemid element
  77     appendTextNode(outDoc, entry, "itemid", getNodeText(inDoc,"itemid"))
  78
  79     # Create an eventtime element
  80     appendTextNode(outDoc, entry, "eventtime", getNodeText(inDoc, "eventtime"))
  81
  82     # Create an subject element
  83     appendTextNode(outDoc, entry, "subject", getNodeText(inDoc, "subject"))
  84
  85     # Create an event node (special case because for some reason there are two
  86     # 'event' elements in the pydump output, which is probably LJ's fault)
  87     event = inDoc.getElementsByTagName("event")[0]
  88     appendTextNode(outDoc, entry, "event", getNodeText(event, "event"))
  89
  90     # Create an allowmask element (doesn't exist in pydump output if public)
  91     maskText = getNodeText(inDoc, "allowmask")
  92
  93     # XXXSMG: consult L-1411 and L-976 for examples of security and
  94     # allowmask use
  95     if(maskText != ""):
  96         appendTextNode(outDoc, entry, "allowmask", maskText)
  97     else:
  98         appendTextNode(outDoc, entry, "allowmask", "0")
  99
 100     # Create a taglist element
 101     appendTextNode(outDoc, entry, "taglist", getNodeText(inDoc, "taglist"))
 102
 103     # XXXSMG: make sure there is a comment file before trying to do anything
 104     # with it
 105     addCommentsForId(outDoc, entry, username, id)
 106
 107 def addCommentsForId(outDoc, entry, username, id):
 108     try:
 109         commentFile = open("%s/C-%s" % (username,id), "r")
 110     except IOError:  # there are no comments for this entry
 111         return
 112
 113     inDoc = xml.dom.minidom.parse(commentFile)
 114
 115     comments = inDoc.getElementsByTagName("comment")
 116
 117     for comment in comments:
 118         outComment = outDoc.createElement("comment")
 119         entry.appendChild(outComment)
 120
 121         # add the item id for the comment
 122         appendTextNode(outDoc, outComment, "itemid",
 123             getNodeText(comment, "id"))
 124
 125         # convert the time string
 126         timeString = getNodeText(comment, "date")
 127         if( timeString != "" ):
 128             inDate = strptime(timeString, "%Y-%m-%dT%H:%M:%SZ")
 129             outDate = strftime("%Y-%m-%d %H:%M:%S", inDate)
 130             appendTextNode(outDoc, outComment, "eventtime", outDate)
 131         else:
 132             emptyTime = outDoc.createElement("eventtime")
 133             outComment.appendChild(emptyTime)
 134
 135         # Create an subject element
 136         appendTextNode(outDoc, outComment, "subject",
 137             getNodeText(comment, "subject"))
 138
 139         # Create an event element
 140         appendTextNode(outDoc, outComment, "event",
 141             getNodeText(comment, "body"))
 142
 143         # Create the author element
 144         author = outDoc.createElement("author")
 145         outComment.appendChild(author)
 146
 147         try:
 148             cUser = getNodeText(comment, "user")
 149         except:
 150             cUser = "anonymous"
 151
 152         appendTextNode(outDoc, author, "name", cUser)
 153         appendTextNode(outDoc, author, "email", cUser + "@livejournal.com")
 154
 155         # Create the parent_itemid
 156         parentId = getNodeText(comment, "parentid")
 157         if(parentId != ""):
 158             appendTextNode(outDoc, outComment, "parent_itemid", parentId)
 159
 160 def main(argv):
 161     username = ""
 162     entryLimit = 250
 163
 164
 165     if( len(argv) != 2 ):
 166         print( "Usage: convertdump.py <username> <entrylimit>" )
 167         return
 168     else:
 169         username = argv[0]
 170         entryLimit = int(argv[1])
 171
 172     userDir = os.listdir(username)
 173
 174     highNum = -1
 175     entryArray = []
 176
 177     # get the list of entries
 178     for file in userDir:
 179         if file.startswith("L-"):
 180             entryNum = int(file.replace("L-",""))
 181
 182             entryArray.append(entryNum)
 183
 184             if( highNum < entryNum ):
 185                 highNum = entryNum
 186
 187     entryArray.sort()
 188
 189
 190     # Create the minidom document
 191     outDoc = xml.dom.minidom.Document()
 192
 193     # Create the <livejournal> base element
 194     ljElement = outDoc.createElement("livejournal")
 195     outDoc.appendChild(ljElement)
 196
 197     currentFileEntry = 0
 198
 199     # start processing entries
 200     for entry in entryArray:
 201         addEntryForId(outDoc, ljElement, username, entry)
 202
 203         currentFileEntry += 1
 204
 205         if( currentFileEntry == entryLimit or entry == entryArray[-1] ):
 206
 207             f = open("%s - %s.xml" % (username, entry), "w")
 208             tempXML = outDoc.toxml("UTF-8")
 209             f.write(tempXML)
 210
 211             currentFileEntry = 0
 212
 213             # Create the minidom document
 214             outDoc = xml.dom.minidom.Document()
 215
 216             # Create the <livejournal> base element
 217             ljElement = outDoc.createElement("livejournal")
 218             outDoc.appendChild(ljElement)
 219
 220 if __name__ == "__main__":
 221     main(sys.argv[1:])
 222