Add some 0.2s delays to conform to new bot policy. Correctly handle disallowing for...

[oss/ljdump.git] / convertdump.py
diff --git a/convertdump.py b/convertdump.py

index f098a58f2a2e0b29fd0fbaca86ceb3a5a7e75509..0f59e8282d6dc9e2e9fa481a1312201b6468c2bf 100755 (executable)
--- a/convertdump.py
+++ b/convertdump.py
@@ -30,6 +30,7 @@ import os
  import codecs
  import sys
  import getopt
+import re
  
  from time import strptime, strftime
  
@@ -85,7 +86,9 @@ def addEntryForId(outDoc, element, username, id, includeSecure):
      # Create an event node (special case because for some reason there are two
      # 'event' elements in the pydump output, which is probably LJ's fault)
      event = inDoc.getElementsByTagName("event")[0]
-    appendTextNode(outDoc, entry, "event", getNodeText(event, "event"))
+    eventText = getNodeText(event, "event")
+
+    appendTextNode(outDoc, entry, "event", replaceLJTags(eventText))
  
      security = getNodeText(inDoc, "security")
  
@@ -152,8 +155,8 @@ def addCommentsForId(outDoc, entry, username, id):
              getNodeText(comment, "subject"))
  
          # Create an event element
-        appendTextNode(outDoc, outComment, "event", 
-            getNodeText(comment, "body"))
+        bodyText = getNodeText(comment, "body")
+        appendTextNode(outDoc, outComment, "event", replaceLJTags(bodyText))
  
          # Create the author element
          author = outDoc.createElement("author")
@@ -172,6 +175,41 @@ def addCommentsForId(outDoc, entry, username, id):
          if(parentId != ""): 
              appendTextNode(outDoc, outComment, "parent_itemid", parentId)
  
+
+# regular expressions used in replaceLJTags()
+#   (global for later reuse - suggestion by jparise)
+
+userRE = re.compile(r'<lj user="(.*?)" ?/?>', re.IGNORECASE)
+commRE = re.compile(r'<lj comm="(.*?)" ?/?>', re.IGNORECASE)
+namedCutRE = re.compile(r'<lj-cut +text="(.*?)" ?/?>',
+                        re.IGNORECASE|re.DOTALL)
+cutRE = re.compile(r'<lj-cut>', re.IGNORECASE)
+endCutRE = re.compile(r'</lj-cut>', re.IGNORECASE)  # own name: must not rebind cutRE
+embedRE = re.compile(r'<lj-embed id="[0-9]+">', re.IGNORECASE)
+
+def replaceLJTags(entry):
+    """Return the text of entry with lj user/comm/cut tags rewritten as HTML."""
+    # replace lj user tags
+    rv = userRE.sub('<a href="http://www.livejournal.com/users/\\1" class="lj-user">\\1</a>', entry)
+
+    # replace lj comm tags
+    rv = commRE.sub('<a href="http://community.livejournal.com/\\1/" class="lj-comm">\\1</a>', rv)
+
+    # replace lj-cut tags: a named cut keeps its text in the "more" marker, a
+    # bare opening cut becomes a plain marker, and the closing tag is dropped
+    rv = namedCutRE.sub('<!--more \\1-->', rv)
+    rv = cutRE.sub('<!--more-->', rv)
+    rv = endCutRE.sub('', rv)
+
+    # replace lj-embed tags
+    # this doesn't actually work.  LJ doesn't include the embedded content
+    # when ljdump calls 'getevents', but instead includes an lj-embed tag
+    # with an id and nothing else.
+    #rv = re.sub(embedRE, '', rv)
+
+    return rv
+
+
  def usage():
      print( "Usage: convertdump.py [arguments]" )
      print( """