#!/usr/bin/python # -*- encoding: utf-8 -*- """ Formats two files, downloaded by ljdump.py into HTML page using template. Syntax ljformatxml -t template -o file.html L-nnnn C-nnnn """ from ConfigParser import ConfigParser import xml.dom.minidom, xml.dom import re import sys,os,glob import codecs # Параметры конфигурации # Директория для симлинков # url картинки с человечком # Директория с результатами дампа # директория для картинки # шаблон для страницы поста # шаблон для блока комментария template={} urls={} dirs={} def read_templates(config): global template for i in ['post','comment']: with codecs.open(config.get('templates',i),'r','utf-8') as f: template[i]=f.read() def set_parameters(config): global urls,dirs for i in config.options('urls'): urls[i]=config.get('urls',i) for i in config.options('directories'): dirs[i]=config.get('directories',i) def process_ljtag(m): """ Receives lj tag match object with lj tag and returns html text which should be used as replacement Used to pass into re.sub """ tag = m.group(0) if tag.find("lj-cut")!=-1: return "" if tag.find('user=')!=-1: name= re.search('user=[\'\"]?(\w+)[\'\"]?',tag).group(1) title = re.search('title=[\"\']?([^"\'>]+)[\'\"]?',tag) if title: title = title.group(1) else: title=name tag= '%s'%(name,urls['icons']+"/userinfo.gif",title) return tag print "unknown lj tag: ",tag def process_text(text): # Выделить оттуда текст, распарсить как html, заменяя lj-тэги try: text = re.sub("]+>",process_ljtag,text) except Exception as e: print 'bad text :',text raise e text = re.sub("\r?\n","
",text) # и заменяя img на локальные копии, если они есть. Если нет, писать в # кеш картинок # FIXME post_props[post_text] = re.sub("]+>",process_img.post_text) return text def format_comments(cmt_list): out=[] for cmt in cmt_list: if len(cmt['children']): cmt['comments']=format_comments(cmt['children']) else: cmt['comments']='' if 'user' in cmt: cmt['userlink']=process_text(''%cmt['user']) out.append(template['comment'] % cmt) return ''.join(out) def do_post(postfile,commentfile,outputfile): """ Handles one post. Returns post date, url, subject and tag list """ # Прочитать L-nnnn post_xml = xml.dom.minidom.parse(postfile) post_props = {'subject':'','taglist':''} for n in post_xml.documentElement.childNodes: if n.nodeType == xml.dom.Node.ELEMENT_NODE: if n.nodeName == u'event': post_props['text']=process_text(n.firstChild.nodeValue) elif n.nodeName == u'props': # Выделить необходимую метаинформацию for n2 in n.childNodes: if n2.nodeType == xml.dom.Node.ELEMENT_NODE: post_props[str(n2.nodeName)] = n2.firstChild.nodeValue else: post_props[str(n.nodeName)] = n.firstChild.nodeValue if not 'text' in post_props: raise ValueError("No event node in ths post") if 'picture_keyword' in post_props: userpic=post_props['picture_keyword'] else: userpic='_' for fmt in ('jpg','gif','png'): if os.access("%s/%s.%s" % (dirs['archive'],userpic,fmt),os.R_OK): post_props['userpic']='%s/userpics/%s.%s'%(urls['images'],userpic,fmt) break if commentfile: comment_xml = xml.dom.minidom.parse( commentfile) # We suppose that comments are already sorted accoridng to post time comment_tree = [] comment_hash = {} comment_count = 0 for c in comment_xml.documentElement.childNodes: if c.nodeType != xml.dom.Node.ELEMENT_NODE or c.nodeName != 'comment': continue comment={'date':'Unknown','children':[],'subject':'','userlink':'(Anonymous)'} for i in c.childNodes: if i.nodeType != xml.dom.Node.ELEMENT_NODE: continue if i.nodeName == 'body': if i.firstChild is None: comment['body']='Deleted comment' else: comment['body']=process_text(i.firstChild.nodeValue) else: tx=i.firstChild if tx: comment[str(i.nodeName)]=tx.nodeValue comment_hash[comment['id']]=comment if 'parentid' in comment and comment['parentid'] in comment_hash: comment_hash[comment['parentid']]['children'].append(comment) comment_count +=1 else: comment_tree.append(comment) post_props['comments'] = format_comments(comment_tree) post_props['comment_count'] = comment_count else: post_props['comments'] = '' post_props['comment_count'] = 0 page = template['post']%post_props with codecs.open(outputfile,"w","utf-8") as f : f.write(page) return (post_props['logtime'],post_props['ditemid'],post_props['subject'],post_props['taglist']) if __name__ == '__main__': config=ConfigParser() if config.read(["ljmkstatic.conf"]) < 1: raise ValueError("No config file found") read_templates(config) set_parameters(config) for post_file in sorted(glob.glob(dirs['dump']+"/L-*")): post_id = re.search("(\d+)$",post_file).group(1) comment_file = dirs['dump']+"/C-"+post_id outfile=dirs['dump']+"/"+post_id+".html" try: t1=os.stat(post_file).st_mtime try: t2=os.stat(comment_file).st_mtime except OSError: t2=0 comment_file = None t3=os.stat(outfile).st_mtime if t3 > t1 and t3 > t2: continue except OSError: pass print "Processing post L-%s"%post_id (date,post_id,subject,tags) = do_post(post_file,comment_file,outfile) # Fix me - update index structures