3 # ljdump.py - livejournal archiver
4 # Greg Hewgill <greg@hewgill.com> http://hewgill.com
9 # This software is provided 'as-is', without any express or implied
10 # warranty. In no event will the author be held liable for any damages
11 # arising from the use of this software.
13 # Permission is granted to anyone to use this software for any purpose,
14 # including commercial applications, and to alter it and redistribute it
15 # freely, subject to the following restrictions:
17 # 1. The origin of this software must not be misrepresented; you must not
18 # claim that you wrote the original software. If you use this software
19 # in a product, an acknowledgment in the product documentation would be
20 # appreciated but is not required.
21 # 2. Altered source versions must be plainly marked as such, and must not be
22 # misrepresented as being the original software.
23 # 3. This notice may not be removed or altered from any source distribution.
25 # Copyright (c) 2005-2010 Greg Hewgill and contributors
27 import codecs, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib
29 from xml.sax import saxutils
38 from hashlib import md5
def calcchallenge(challenge, password):
    """Return the challenge-response digest for *challenge* and *password*.

    The result is hex(md5(challenge + hex(md5(password)))), the value sent
    as ``auth_response`` by the callers in this file.

    Text (unicode) arguments are encoded as UTF-8 before hashing.  Without
    this, a password containing non-ASCII characters (for example one read
    from the XML config file, which yields unicode strings) makes md5()
    raise UnicodeEncodeError.  Byte strings are hashed unchanged, so
    existing ASCII/byte-string callers get the same digest as before.
    """
    # type(u"") is `unicode` on Python 2 and `str` on Python 3.
    text_type = type(u"")
    if isinstance(password, text_type):
        password = password.encode("UTF-8")
    if isinstance(challenge, text_type):
        challenge = challenge.encode("UTF-8")
    hashed_password = md5(password).hexdigest()
    # hexdigest() is pure ASCII; it only needs encoding where it is text.
    if isinstance(hashed_password, text_type):
        hashed_password = hashed_password.encode("ascii")
    return md5(challenge + hashed_password).hexdigest()
# Read a response from the "flat" interface: lines are consumed in pairs,
# a name line followed by a value line, from the file-like `response`.
# NOTE(review): the loop, the accumulation of pairs and the return statement
# are not visible in this excerpt. Callers index the result by name
# (response['challenge'], response['ljsession']), so it presumably returns a
# dict of name -> value -- confirm against the full source.
46 def flatresponse(response):
49 name = response.readline()
# Drop the final character of the line (presumably the trailing newline).
53 name = name[:len(name)-1]
54 value = response.readline()
# Same trimming for the value line.
56 value = value[:len(value)-1]
# Obtain an "ljsession" value through the flat interface using
# challenge-response authentication.
#   server   -- base URL; "/interface/flat" is appended to it
#   username -- account name, interpolated into the request body as-is
#   password -- password, only ever sent as a calcchallenge() digest
# Returns the 'ljsession' field of the sessiongenerate response.
# NOTE(review): username is not URL-encoded before being placed in the
# form body.
60 def getljsession(server, username, password):
# Step 1: ask the server for a challenge (passing a data string makes
# urllib2 issue a POST).
61 r = urllib2.urlopen(server+"/interface/flat", "mode=getchallenge")
62 response = flatresponse(r)
# Step 2: answer the challenge and request a session.
64 r = urllib2.urlopen(server+"/interface/flat", "mode=sessiongenerate&user=%s&auth_method=challenge&auth_challenge=%s&auth_response=%s" % (username, response['challenge'], calcchallenge(response['challenge'], password)))
65 response = flatresponse(r)
67 return response['ljsession']
# Prepare challenge-response authentication fields for an XML-RPC call.
#   server   -- XML-RPC proxy exposing LJ.XMLRPC.getchallenge()
#   params   -- request parameters supplied by the caller
#   password -- password used to compute the response digest
# NOTE(review): the lines that merge the fields below into `params` and the
# return statement are not visible in this excerpt. Callers pass the result
# directly as the argument of XML-RPC methods, so it presumably returns the
# augmented params dict -- confirm against the full source.
69 def dochallenge(server, params, password):
# A fresh challenge is requested on every call.
70 challenge = server.LJ.XMLRPC.getchallenge()
72 'auth_method': "challenge",
73 'auth_challenge': challenge['challenge'],
74 'auth_response': calcchallenge(challenge['challenge'], password)
# Write mapping `e` to file object `f` as an XML element called `name`.
# Values that are dicts are written recursively as nested elements; any
# other value is converted to text and written as <key>text</key>.
# NOTE(review): the statement that binds `k` (presumably a loop over the
# keys of `e`) and the else:/try: headers are not visible in this excerpt.
78 def dumpelement(f, name, e):
79 f.write("<%s>\n" % name)
# {}.__class__ is the built-in dict type.
81 if isinstance(e[k], {}.__class__):
82 dumpelement(f, k, e[k])
# Decode the value's str() form as UTF-8 first.
85 s = unicode(str(e[k]), "UTF-8")
86 except UnicodeDecodeError:
87 # fall back to Latin-1 for old entries that aren't UTF-8
# The codec actually used for the fallback is cp1252.
88 s = unicode(str(e[k]), "cp1252")
# saxutils.escape() escapes &, < and > in the text content.
89 f.write("<%s>%s</%s>\n" % (k, saxutils.escape(s), k))
90 f.write("</%s>\n" % name)
# Write one entry to the file `fn` as a UTF-8 encoded XML document whose
# root element is <event>.
#   fn    -- path of the file to create or overwrite
#   event -- mapping passed to dumpelement()
# NOTE(review): any statement closing `f` is not visible in this excerpt.
92 def writedump(fn, event):
93 f = codecs.open(fn, "w", "UTF-8")
94 f.write("""<?xml version="1.0"?>\n""")
95 dumpelement(f, "event", event)
# Save the sync checkpoint to "<journal>/.last": lastsync on the first
# line and lastmaxid on the second. ljdump() reads the file back in the
# same order.
# NOTE(review): any statement closing `f` is not visible in this excerpt.
98 def writelast(journal, lastsync, lastmaxid):
99 f = open("%s/.last" % journal, "w")
100 f.write("%s\n" % lastsync)
101 f.write("%s\n" % lastmaxid)
# Build a DOM element called `name` in document `doc`, with one child
# element per key of `map` whose text content is the corresponding value.
# The parameter `map` shadows the built-in map() inside this function.
# NOTE(review): the statement that binds `k`, the attachment of `me` to `e`
# and the return statement are not visible in this excerpt. The caller
# appends the result to a document element, so it presumably returns `e` --
# confirm against the full source.
104 def createxml(doc, name, map):
105 e = doc.createElement(name)
107 me = doc.createElement(k)
# createTextNode() is given map[k] directly, with no str() conversion here.
108 me.appendChild(doc.createTextNode(map[k]))
# NOTE(review): this is the last line of a function whose header is not
# visible in this excerpt (presumably gettext(), which ljdump() calls with
# the result of getElementsByTagName). It returns the text value of the
# first child of the first element in `e`.
115 return e[0].firstChild.nodeValue
# Archive one journal into a directory named after `Journal`.
#   Server   -- server URL (a trailing "/interface/xmlrpc" is recognised)
#   Username -- account used to log in
#   Password -- that account's password
#   Journal  -- journal to archive; may differ from Username
# Visible phases: (1) read the checkpoint in "<Journal>/.last", (2) log in
# and collect userpic URLs, (3) fetch changed entries one at a time via
# syncitems/getevents, (4) fetch comment metadata and comment bodies from
# export_comments.bml, (5) download userpics when Username == Journal,
# (6) print a summary.
# NOTE(review): many lines of this function (loop and try/except headers,
# counters, close() calls) are not visible in this excerpt; the comments
# below describe only what the visible lines show.
117 def ljdump(Server, Username, Password, Journal):
# Recognise a Server value that already contains the XML-RPC path; the
# capture group is the base URL (presumably Server is reset to it -- the
# lines that use `m` are not visible).
118 m = re.search("(.*)/interface/xmlrpc", Server)
# When archiving a journal other than the login user, export_comments.bml
# requests carry an extra "&authas=<Journal>" query parameter.
121 if Username != Journal:
122 authas = "&authas=%s" % Journal
126 print "Fetching journal entries for: %s" % Journal
129 print "Created subdirectory: %s" % Journal
# Session value sent later in the Cookie header of export_comments.bml
# requests.
133 ljsession = getljsession(Server, Username, Password)
135 server = xmlrpclib.ServerProxy(Server+"/interface/xmlrpc")
# Read back the checkpoint written by writelast(): line 1 is lastsync,
# line 2 is lastmaxid; a trailing newline is removed from each.
144 f = open("%s/.last" % Journal, "r")
145 lastsync = f.readline()
# NOTE(review): unlike the lastmaxid check below, this indexes lastsync[-1]
# without a length guard, so an empty file would raise IndexError unless an
# enclosing handler (not visible here) catches it.
146 if lastsync[-1] == '\n':
147 lastsync = lastsync[:len(lastsync)-1]
148 lastmaxid = f.readline()
149 if len(lastmaxid) > 0 and lastmaxid[-1] == '\n':
150 lastmaxid = lastmaxid[:len(lastmaxid)-1]
154 lastmaxid = int(lastmaxid)
# Remember the starting point for the final summary message.
158 origlastsync = lastsync
160 r = server.LJ.XMLRPC.login(dochallenge(server, {
161 'username': Username,
# Map each picture keyword to its URL; the default picture, if any, is
# stored under the key '*'.
166 userpics = dict(zip(map(str, r['pickws']), r['pickwurls']))
167 if r['defaultpicurl']:
168 userpics['*'] = r['defaultpicurl']
# Ask the server which items changed since lastsync.
172 r = server.LJ.XMLRPC.syncitems(dochallenge(server, {
173 'username': Username,
175 'lastsync': lastsync,
176 'usejournal': Journal,
# An empty syncitems list means nothing is left to fetch (the statement
# executed in that case is not visible).
179 if len(r['syncitems']) == 0:
181 for item in r['syncitems']:
# Item ids starting with 'L' are journal entries.
182 if item['item'][0] == 'L':
183 print "Fetching journal entry %s (%s)" % (item['item'], item['action'])
186 e = server.LJ.XMLRPC.getevents(dochallenge(server, {
187 'username': Username,
# [2:] drops the two-character type prefix of the item id (e.g. "L-").
190 'itemid': item['item'][2:],
191 'usejournal': Journal,
# The first returned event is saved as "<Journal>/<item id>".
194 writedump("%s/%s" % (Journal, item['item']), e['events'][0])
197 print "Unexpected empty item: %s" % item['item']
199 except xmlrpclib.Fault, x:
200 print "Error getting item: %s" % item['item']
# NOTE(review): str.find() returns -1 (truthy) when the text is absent and
# 0 (falsy) when it is found at index 0, so this condition is true for
# almost every fault. The intended test is presumably
# `"..." in str(x)` or `.find(...) != -1`.
203 if str(x).find("will be able to continue posting within an hour."):
204 print "Waiting a hour"
# Advance the checkpoint after each item so an interrupted run can resume.
207 lastsync = item['time']
208 writelast(Journal, lastsync, lastmaxid)
210 # The following code doesn't work because the server rejects our repeated calls.
211 # http://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html
212 # contains the statement "You should use the syncitems selecttype in
213 # conjuntions [sic] with the syncitems protocol mode", but provides
214 # no other explanation about how these two function calls should
215 # interact. Therefore we just do the above slow one-at-a-time method.
218 # r = server.LJ.XMLRPC.getevents(dochallenge(server, {
219 # 'username': Username,
221 # 'selecttype': "syncitems",
222 # 'lastsync': lastsync,
225 # if len(r['events']) == 0:
227 # for item in r['events']:
228 # writedump("%s/L-%d" % (Journal, item['itemid']), item)
230 # lastsync = item['eventtime']
232 print "Fetching journal comments for: %s" % Journal
# Load the caches pickled by earlier runs: comment metadata and the
# posterid -> user map. Both files are rewritten further down.
235 f = open("%s/comment.meta" % Journal)
236 metacache = pickle.load(f)
242 f = open("%s/user.map" % Journal)
243 usermap = pickle.load(f)
# Fetch comment metadata starting after the highest id seen so far,
# authenticating with the session cookie.
253 r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
254 meta = xml.dom.minidom.parse(r)
256 print "*** Error fetching comment meta, possibly not community maintainer?"
263 except AttributeError: # r is sometimes a dict for unknown reasons
# Text of the optional <nextid> element, when present.
265 nxid=meta.getElementsByTagName("nextid")
267 nxid = nxid[0].firstChild.nodeValue
270 print "Got meta data maxid = %d nextid=%s"%(
271 int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue),
# Record posterid and state for each <comment> element (the comment-body
# pass below reads metacache[id]['state']).
274 for c in meta.getElementsByTagName("comment"):
275 id = int(c.getAttribute("id"))
277 'posterid': c.getAttribute("posterid"),
278 'state': c.getAttribute("state"),
# Each <usermap> element maps a poster id to a user name.
282 for u in meta.getElementsByTagName("usermap"):
283 usermap[u.getAttribute("id")] = u.getAttribute("user")
# Metadata is complete once our maxid has reached the server's <maxid>.
284 if maxid >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue):
# Persist both caches for the next run.
287 f = open("%s/comment.meta" % Journal, "w")
288 pickle.dump(metacache, f)
291 f = open("%s/user.map" % Journal, "w")
292 pickle.dump(usermap, f)
# Fetch comment bodies, again starting after maxid.
300 r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_body&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
301 meta = xml.dom.minidom.parse(r)
303 print "*** Error fetching comment body, possibly not community maintainer?"
304 print "*** requested id %d "%(maxid+1)
310 for c in meta.getElementsByTagName("comment"):
311 id = int(c.getAttribute("id"))
312 jitemid = c.getAttribute("jitemid")
# Fields of the comment record; 'state' comes from the metadata cache.
315 'parentid': c.getAttribute("parentid"),
316 'subject': gettext(c.getElementsByTagName("subject")),
317 'date': gettext(c.getElementsByTagName("date")),
318 'body': gettext(c.getElementsByTagName("body")),
319 'state': metacache[id]['state'],
# Add the poster's user name when the poster id is known.
321 if usermap.has_key(c.getAttribute("posterid")):
322 comment["user"] = usermap[c.getAttribute("posterid")]
# Comments are stored per entry in "<Journal>/C-<jitemid>": parse the
# existing file, or start a new document with a <comments> root.
324 entry = xml.dom.minidom.parse("%s/C-%s" % (Journal, jitemid))
326 entry = xml.dom.minidom.getDOMImplementation().createDocument(None, "comments", None)
# Look for a stored comment with the same id to detect duplicates.
328 for d in entry.getElementsByTagName("comment"):
329 if int(d.getElementsByTagName("id")[0].firstChild.nodeValue) == id:
333 print "Warning: downloaded duplicate comment id %d in jitemid %s" % (id, jitemid)
335 entry.documentElement.appendChild(createxml(entry, "comment", comment))
336 f = codecs.open("%s/C-%s" % (Journal, jitemid), "w", "UTF-8")
342 if maxid >= newmaxid:
347 writelast(Journal, lastsync, lastmaxid)
# Userpics are only downloaded when archiving the login user's own journal.
349 if Username == Journal:
350 print "Fetching userpics for: %s" % Username
351 f = open("%s/userpics.xml" % Username, "w")
352 print >>f, """<?xml version="1.0"?>"""
353 print >>f, "<userpics>"
# NOTE(review): keyword and URL are interpolated into XML attributes without
# escaping, so a keyword containing '"', '&' or '<' yields malformed XML.
355 print >>f, """<userpic keyword="%s" url="%s" />""" % (p, userpics[p])
356 pic = urllib2.urlopen(userpics[p])
# File extension looked up from the response Content-Type, defaulting to "".
# MimeExtensions is not defined in the visible part of this file.
357 ext = MimeExtensions.get(pic.info()["Content-Type"], "")
# Replace the characters * ? \ / : < > " | in the keyword with "_" to
# form the file name.
358 picfn = re.sub(r'[*?\\/:<>"|]', "_", p)
360 picfn = codecs.utf_8_decode(picfn)[0]
361 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
363 # for installations where the above utf_8_decode doesn't work
# Fallback: replace every non-ASCII character with "_".
364 picfn = "".join([ord(x) < 128 and x or "_" for x in picfn])
365 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
# Stream the downloaded picture into the file.
366 shutil.copyfileobj(pic, picf)
369 print >>f, "</userpics>"
# Final summary; the first form also reports the starting sync time.
373 print "%d new entries, %d new comments (since %s)" % (newentries, newcomments, origlastsync)
375 print "%d new entries, %d new comments" % (newentries, newcomments)
377 print "%d errors" % errors
# Script entry point. If a file named "ljdump.config" is accessible in the
# current directory, settings are read from it; otherwise the user is
# prompted interactively.
# NOTE(review): the else:/for/if headers that select between the ljdump()
# calls below are not visible in this excerpt.
379 if __name__ == "__main__":
380 if os.access("ljdump.config", os.F_OK):
# The config file is XML: the text of the first <server>, <username> and
# <password> elements is used, plus any number of <journal> elements.
381 config = xml.dom.minidom.parse("ljdump.config")
382 server = config.documentElement.getElementsByTagName("server")[0].childNodes[0].data
383 username = config.documentElement.getElementsByTagName("username")[0].childNodes[0].data
384 password = config.documentElement.getElementsByTagName("password")[0].childNodes[0].data
385 journals = config.documentElement.getElementsByTagName("journal")
# Archive a journal named by a <journal> element (`e` is presumably the loop
# variable over `journals`) ...
388 ljdump(server, username, password, e.childNodes[0].data)
# ... or the user's own journal.
390 ljdump(server, username, password, username)
# Interactive mode: getpass() prompts for the password.
392 from getpass import getpass
393 print "ljdump - livejournal archiver"
395 print "Enter your Livejournal username and password."
# The server is fixed in interactive mode.
397 server = "http://livejournal.com"
398 username = raw_input("Username: ")
399 password = getpass("Password: ")
401 print "You may back up either your own journal, or a community."
402 print "If you are a community maintainer, you can back up both entries and comments."
403 print "If you are not a maintainer, you can back up only entries."
405 journal = raw_input("Journal to back up (or hit return to back up '%s'): " % username)
# Archive the named journal, or the user's own journal.
408 ljdump(server, username, password, journal)
410 ljdump(server, username, password, username)