2 # -*- encoding: utf-8 -*-
4 Formats two files, downloaded by ljdump.py into HTML page using
7 Syntax ljformatxml -t template -o file.html L-nnnn C-nnnn
10 from ConfigParser import ConfigParser
11 import xml.dom.minidom, xml.dom
15 # Configuration parameters
16 # Directory for symlinks
17 # URL of the little-man (user info) icon
18 # Directory holding the dump results
19 # Directory for the pictures
20 # Template for the post page
21 # Template for the comment block
# Load the page templates named in the [templates] section of the config.
# For each of the 'post' and 'comment' options, the configured path is opened
# as UTF-8 text.
# NOTE(review): the statement that stores the file contents is not visible in
# this listing; other code in this file reads template['post'] and
# template['comment'].
25 def read_templates(config):
27 for i in ['post','comment']:
28 with codecs.open(config.get('templates',i),'r','utf-8') as f:
def set_parameters(config):
    """Copy the [urls] and [directories] config sections into urls and dirs.

    Every option found in the 'urls' section is stored under its option name
    in the ``urls`` mapping, and every option of the 'directories' section is
    stored the same way in ``dirs``.  Both mappings are defined outside this
    function and are only filled in here, never rebound.
    """
    url_section = 'urls'
    for option in config.options(url_section):
        urls[option] = config.get(url_section, option)

    dir_section = 'directories'
    for option in config.options(dir_section):
        dirs[option] = config.get(dir_section, option)
41 Receives lj tag match object with lj tag and returns
42 html text which should be used as replacement
43 Used to pass into re.sub
# An lj-cut tag is recognised by a plain substring search.
# NOTE(review): what is done with it happens on a line not visible in this listing.
46 if tag.find("lj-cut")!=-1:
# Tags carrying a user= attribute are turned into a link to that user's profile.
48 if tag.find('user=')!=-1:
# The user name is the run of word characters after user=, optionally quoted.
# NOTE(review): re.search returns None when nothing matches, in which case
# .group(1) raises AttributeError.
49 name= re.search('user=[\'\"]?(\w+)[\'\"]?',tag).group(1)
# An optional title= attribute is looked up as well; it is used as the link text below.
50 title = re.search('title=[\"\']?([^"\'>]+)[\'\"]?',tag)
52 title = title.group(1)
# The link shows the userinfo.gif icon taken from urls['icons'], followed by the title.
55 tag= '<a style="color: blue; font-weight: bold;" href="http://www.livejournal.com/users/%s/profile"><img src="%s">%s</a>'%(name,urls['icons']+"/userinfo.gif",title)
# Python 2 print statement; presumably reached for tags matching neither case above.
57 print "unknown lj tag: ",tag
# Convert raw entry or comment text to HTML: lj tags are passed through
# process_ljtag and line breaks become <br>.
59 def process_text(text):
60 # Extract the text from there and parse it as HTML, replacing lj tags
# Every opening or closing <lj ...> tag is handed to process_ljtag via re.sub.
62 text = re.sub("</?lj[^>]+>",process_ljtag,text)
63 except Exception as e:
# Print the offending text for diagnosis (Python 2 print statement).
64 print 'bad text :',text
# Both \r\n and \n line endings are replaced by <br>.
66 text = re.sub("\r?\n","<br>",text)
67 # and replacing img with local copies if they exist. If not, write to
69 # FIXME post_props[post_text] = re.sub("<img # [^>]+>",process_img.post_text)
# Render a list of comment dicts to HTML using template['comment'].
# NOTE(review): the loop header, the accumulator setup and the return are on
# lines not visible in this listing.
71 def format_comments(cmt_list):
# Replies are rendered recursively and stored under the 'comments' key.
74 if len(cmt['children']):
75 cmt['comments']=format_comments(cmt['children'])
# The author link is produced by running a synthetic <lj user="..."> tag
# through process_text.
79 cmt['userlink']=process_text('<lj user="%s">'%cmt['user'])
# The comment dict supplies the values for the template's % placeholders.
80 out.append(template['comment'] % cmt)
# Build one HTML page from a post XML file and its comment XML file and write
# it to outputfile.
# NOTE(review): several statements of this function are not visible in this
# listing (initialisation of comment_tree / comment_hash / comment_count, the
# else branches, and the write of the page), so the notes below describe only
# the visible lines.
83 def do_post(postfile,commentfile,outputfile):
85 Handles one post. Returns post date, url, subject and tag list
88 post_xml = xml.dom.minidom.parse(postfile)
# Defaults so the values returned at the end exist even when the post has no
# subject or tags.
89 post_props = {'subject':'','taglist':''}
90 for n in post_xml.documentElement.childNodes:
91 if n.nodeType == xml.dom.Node.ELEMENT_NODE:
# The <event> element carries the post body.
# NOTE(review): firstChild is None for an empty element, which would make
# .nodeValue raise AttributeError here and in the two assignments below.
92 if n.nodeName == u'event':
93 post_props['text']=process_text(n.firstChild.nodeValue)
94 elif n.nodeName == u'props':
95 # Extract the required metadata
# Each child element of <props> is flattened into post_props under its tag name.
96 for n2 in n.childNodes:
97 if n2.nodeType == xml.dom.Node.ELEMENT_NODE:
98 post_props[str(n2.nodeName)] = n2.firstChild.nodeValue
# Other top-level elements are stored under their own tag name.
100 post_props[str(n.nodeName)] = n.firstChild.nodeValue
102 if not 'text' in post_props:
103 raise ValueError("No event node in ths post")
104 if 'picture_keyword' in post_props:
105 userpic=post_props['picture_keyword']
# Look for the userpic in each of the supported image formats.
# NOTE(review): the file is checked directly under dirs['archive'] while the
# URL is built under urls['images'] + '/userpics/' - confirm these two
# locations correspond.
108 for fmt in ('jpg','gif','png'):
109 if os.access("%s/%s.%s" % (dirs['archive'],userpic,fmt),os.R_OK):
110 post_props['userpic']='%s/userpics/%s.%s'%(urls['images'],userpic,fmt)
113 comment_xml = xml.dom.minidom.parse( commentfile)
114 # We suppose that comments are already sorted according to post time
# Only <comment> element nodes are of interest.
118 for c in comment_xml.documentElement.childNodes:
119 if c.nodeType != xml.dom.Node.ELEMENT_NODE or c.nodeName != 'comment':
# Defaults for fields that a comment element may lack.
121 comment={'date':'Unknown','children':[],'subject':'','userlink':'(Anonymous)'}
122 for i in c.childNodes:
123 if i.nodeType != xml.dom.Node.ELEMENT_NODE:
# A <body> element without content is shown as a deleted comment.
125 if i.nodeName == 'body':
126 if i.firstChild is None:
127 comment['body']='<b>Deleted comment</b>'
129 comment['body']=process_text(i.firstChild.nodeValue)
# NOTE(review): tx is assigned on a line not visible in this listing.
133 comment[str(i.nodeName)]=tx.nodeValue
# Index the comment by id so later replies can be attached to it.
# NOTE(review): assumes every comment element has an <id> child; otherwise
# this raises KeyError.
134 comment_hash[comment['id']]=comment
# A reply whose parent has already been seen is nested under that parent.
135 if 'parentid' in comment and comment['parentid'] in comment_hash:
136 comment_hash[comment['parentid']]['children'].append(comment)
139 comment_tree.append(comment)
141 post_props['comments'] = format_comments(comment_tree)
142 post_props['comment_count'] = comment_count
144 post_props['comments'] = ''
145 post_props['comment_count'] = 0
# post_props supplies the values for the post template's % placeholders.
146 page = template['post']%post_props
148 with codecs.open(outputfile,"w","utf-8") as f :
# Returns (logtime, ditemid, subject, taglist).
# NOTE(review): logtime and ditemid have no defaults, so a post lacking them
# raises KeyError here.
150 return (post_props['logtime'],post_props['ditemid'],post_props['subject'],post_props['taglist'])
# Script entry point: read ljmkstatic.conf, load templates and parameters,
# then process each L-* post file found in the dump directory.
155 if __name__ == '__main__':
156 config=ConfigParser()
# NOTE(review): ConfigParser.read() returns the list of files it parsed, so
# this compares a list with an int; an emptiness test was probably intended -
# verify that a missing config file is actually detected.
157 if config.read(["ljmkstatic.conf"]) < 1:
158 raise ValueError("No config file found")
159 read_templates(config)
160 set_parameters(config)
161 for post_file in sorted(glob.glob(dirs['dump']+"/L-*")):
# The numeric suffix of the post file name identifies the matching comment
# file (C-<id>) and the output page (<id>.html), all in the dump directory.
162 post_id = re.search("(\d+)$",post_file).group(1)
163 comment_file = dirs['dump']+"/C-"+post_id
164 outfile=dirs['dump']+"/"+post_id+".html"
# Modification times of the post file, the comment file and the existing output.
# NOTE(review): the statements around these stat calls, and the action taken
# when the output is newer than both inputs, are not visible in this listing.
166 t1=os.stat(post_file).st_mtime
168 t2=os.stat(comment_file).st_mtime
172 t3=os.stat(outfile).st_mtime
173 if t3 > t1 and t3 > t2:
177 print "Processing post L-%s"%post_id
# post_id is rebound here to the second value returned by do_post.
178 (date,post_id,subject,tags) = do_post(post_file,comment_file,outfile)
179 # Fix me - update index structures