User:Talkindexbot/source

维基百科,自由的百科全书
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
talkindex.py v2.19 by [[zh:user:Shizhao]]



"""
#
# (C) Shizhao, 2008
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: talkindex.py,v 2.16 2008-03-28 Shizhao $'
#
import os
import sys
import urllib
import re, time, datetime
import wikipedia, config, cosmetic_changes, catlib
import xml.parsers.expat 
# Site object used by every page and category lookup in this script.
site = wikipedia.getSite()
# Member lists of the talk-page categories for featured articles, good
# articles and featured lists.  They are built once, when the module is
# loaded, and handed to FGflag() by the `start` handler.
FAcat=catlib.Category(site, u'Category:特色条目讨论')
FAcats=FAcat.articlesList()
GAcat=catlib.Category(site, u'Category:优良条目讨论')
GAcats=GAcat.articlesList()
FALcat=catlib.Category(site, u'Category:特色列表讨论')
FALcats=FALcat.articlesList()

def GetPage(rcstart,rcend, rcns):
    """Fetch the recent-changes XML for one namespace and return it as a string.

    rcstart -- timestamp (YYYYMMDDHHMMSS) at which the listing starts
    rcend   -- timestamp (YYYYMMDDHHMMSS) at which the listing ends
    rcns    -- namespace number to list, as a string

    The query asks for at most 500 entries (rclimit=500) and no continuation
    is requested, so anything beyond that limit is not returned.
    """
    baseurl = 'http://zh.100ke.info/w/api.php?action=query&list=recentchanges&rcstart=%s&rcend=%s&rcnamespace=%s&rcprop=title|user|timestamp|comment&rclimit=500&format=xml'

    uo=wikipedia.MyURLopener()
    url = baseurl % (rcstart, rcend, rcns)
    u = uo.open(url)
    try:
        wikipedia.output(u'URL:\n%s' % url)
        return u.read()
    finally:
        # The bot loops forever; release the connection on every call
        # instead of waiting for garbage collection.
        u.close()

# Shared state filled by the expat handlers below.
tempList = []      # rendered {{talkindex}} lines for the namespace being processed
titleList=[]       # page titles already emitted by start()
talkList=[]        # section names already emitted by start()
tlist=[]           # "title#section" for every matching edit (see listelement)
tulist=[]          # "title#section" + user name, parallel to tlist
# Section name inside an automatic edit summary of the form "/* Section */".
# Both asterisks are escaped: an unescaped "/*" means "zero or more slashes",
# which let the match start at any earlier space in the summary and capture
# text that precedes the real "/* " marker.
talkformat = r'/\* (.*?) \*/'
n=0                # number of index entries produced for the current namespace
template='''* {{talkindex|title=%s|user=%s|talk=%s|time=%s|indiscussion=%s|hot=%s|fg=%s|nomain=%s|disambig=%s|redirect=%s|empty=%s|protect=%s|nowcommons=%s|talkprotect=%s}}\n'''
# Parse the XML data and collect every section edit into tlist / tulist.
def listelement(name, attrs):
    """Expat start-element handler that records one recent-changes entry.

    Only <rc> elements are considered.  An entry is recorded when it carries
    title, user, comment and timestamp attributes and its comment contains a
    section marker matching `talkformat`; anything else is ignored.
    """
    if name != 'rc':
        return
    try:
        page_title = attrs[u'title']
        editor = attrs[u'user']
        summary = attrs[u'comment']
        attrs[u'timestamp']  # value unused, but entries lacking it are skipped
    except KeyError:
        return
    found = re.search(talkformat, summary)
    if found is None:
        # Not a section edit.
        return
    thread = page_title + '#' + found.groups()[0]
    tlist.append(thread)
    tulist.append(thread + editor)

def remove_dups(lst):
    """Return a new list with the duplicate elements of `lst` removed.

    The first occurrence of every element is kept and the original order is
    preserved.  Elements must be hashable (lists, dicts etc. as elements
    raise TypeError).
    """
    seen = {}
    unique = []
    for item in lst:
        if item not in seen:
            seen[item] = None
            unique.append(item)
    return unique

# Number of participating users and number of edits for one thread.
def usersum(t):
    """Return (nuser, ntalk) for the thread `t` ("title#section").

    ntalk is the number of recorded edits to the thread and nuser the number
    of distinct users who made them, both taken from the module-level
    tlist / tulist.  A thread with fewer than two recorded edits yields (0, 0).
    """
    if tlist.count(t) <= 1:
        return 0, 0
    # tlist and tulist are filled in lockstep, so pairing them gives the
    # "thread + user" key of every edit that belongs to this thread.
    edits = [tu for tt, tu in zip(tlist, tulist) if tt == t]
    return len(remove_dups(edits)), len(edits)

# Flag hot discussions.
def bighot(t, title):
    """Return 'yes' when thread `t` counts as a hot discussion, else 'no'.

    A thread is hot when usersum() reports more than one user and more than
    two edits; hot threads are also reported on the console.
    """
    users, edits = usersum(t)
    if users > 1 and edits > 2:
        wikipedia.output('"Hot!" In [[%s]] have %s users (%s edits) Talking......!' % (title, users, edits))
        return 'yes'
    return 'no'

# Add the {{indiscussion}} template to the talk page.
def IndiscussionAuto(t, title, talk, days, i):
    """Tag a busy thread with {{indiscussion|<month>}} right below its heading.

    t     -- thread key "title#section", used for the activity count
    title -- title of the talk page to edit
    talk  -- section name of the thread
    days  -- length of the observed period, only used in the edit summary
    i     -- result of Indiscussion(); the page is only touched when it is None

    The page is edited when more than one user made more than five edits to
    the thread.  The heading is looked up as "== <talk> ==" followed by a
    newline, ignoring case.
    """
    nedit=usersum(t)
    print('%s %s' % (nedit, i))
    if not (nedit[0]>1 and nedit[1]>5 and i is None):
        return

    pg=wikipedia.Page(site,title)
    text = pg.get()

    # The section name is matched literally (it may contain regex
    # metacharacters) and case-insensitively.  The flag is given to
    # re.compile because the fourth positional argument of re.sub is the
    # replacement count, not the flags.
    heading = re.compile(re.escape('== ' + talk + ' ==' + '\n'), re.I)
    marker = '{{indiscussion|' + str(time.gmtime()[1]) + '}}' + '\n'
    # A callable replacement keeps the heading exactly as written on the page
    # and avoids backslash processing of the section name.
    newtext = heading.sub(lambda found: found.group(0) + marker, text)
    if newtext == text:
        # Heading not found in the expected form: nothing to save.
        wikipedia.output(u'heading not found, {{indiscussion}} not added: %s' % talk)
        return
    pg.put(newtext, u'Bot添加 {{indiscussion}} 模板,最近%s天有%s位用户正在讨论“[[%s|%s]]”话题,已经编辑%s次' %(days,nedit[0], t,talk, nedit[1] ))
    wikipedia.output(u'flag {{indiscussion}}: %s' % talk)
# Update the {{CurrentDiscussion}} template.
def CurrentDiscussion():
    """Write the number of pages transcluding {{indiscussion}} to Template:CurrentDiscussion."""
    template = 'indiscussion'
    source = wikipedia.Page(site, 'Template:%s' % template)
    total = len(list(source.getReferences(onlyTemplateInclusion=True)))

    target = wikipedia.Page(site, 'Template:CurrentDiscussion')
    body = u"<font color=red>%s</font>项<noinclude>\n----\n参见[[:Category:進行中的討論|進行中的討論]]\n[[category:維基站務模板|C]]\n</noinclude>" % total
    summary = u"Bot更新: 当前有 %s 项专题讨论" % total
    wikipedia.output('[[Template:CurrentDiscussion]] update: Current %s Discussion' % total)
    target.put(body, summary, minorEdit=False)
	
# Detect threads already marked as an ongoing discussion.
def Indiscussion(t, title,talk,days):
    """Return 'yes' if the section `talk` on page `title` carries {{indiscussion}}.

    Returns None when the template is not found below the heading or when
    the page text cannot be retrieved.  `t` and `days` are accepted for
    symmetry with IndiscussionAuto() and are not used.
    """
    s=wikipedia.Page(site,title)
    try:
        text=s.get()
    except Exception:
        # Best effort: a page that cannot be read counts as "not marked".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # through.
        return None

    # The section name is escaped so that titles containing regex
    # metacharacters are matched literally instead of breaking the pattern.
    pattern = '=* *' + re.escape(talk) + ' *=*' + ' *\n*' + r'\{\{indiscussion(|)(.*?)\}\}'
    if re.search(pattern, text, re.I) is None:
        return None
    wikipedia.output(u'"{{Indiscussion}}" fond in [[%s]]' % title)
    return 'yes'
    
		
# Flag featured and good articles (and featured lists).
def FGflag(title,FAcats, GAcats, FALcats):
    """Return 'FA', 'GA' or '' depending on which category list holds the page.

    title   -- title of the talk page
    FAcats  -- pages in the featured-article talk category
    GAcats  -- pages in the good-article talk category
    FALcats -- pages in the featured-list talk category

    Featured articles and featured lists both yield 'FA'.  Membership is
    checked against the supplied lists; the original author noted that
    wikipedia.getCategoryLinks() only returned an empty category list at the
    time, which is why the page's own categories are not used.
    """
    page = wikipedia.Page(site, title)
    if page in FAcats or page in FALcats:
        return 'FA'
    if page in GAcats:
        return 'GA'
    return ""

        
  
def nontalk(title):
    """Describe the page a talk page belongs to, plus the talk page's protection.

    Returns a dict whose values are '' unless the condition holds:

    nomain      -- 'none' when the subject page does not exist
    disambig    -- 'yes' when the subject page is a disambiguation page
    redirect    -- 'yes' when the subject page is a redirect
    empty       -- 'yes' when the subject page is empty and not a redirect
                   (typically vandalism)
    protect     -- 'yes' when the subject page cannot be edited
    nowcommons  -- 'yes' when the subject page is an image whose file is on Commons
    talkprotect -- 'yes' when the talk page itself cannot be edited
    """
    pg=wikipedia.Page(site,title)
    nontalkpage=pg.toggleTalkPage()
    # Every flag starts out empty; only positive findings are written below.
    stuts={u'nomain':'',u'disambig':'',u'redirect':'',u'empty':'',u'protect':'',u'nowcommons':'',u'talkprotect':''}
    if nontalkpage.exists():
        if nontalkpage.isDisambig():
            stuts[u'disambig']='yes'
            wikipedia.output(u'[[%s]] is Disambig page!' % nontalkpage.title())
        is_redirect = nontalkpage.isRedirectPage()
        if is_redirect:
            stuts[u'redirect']='yes'
            wikipedia.output(u'[[%s]] is Redirect Page!' % nontalkpage.title())
        # A redirect is never reported as empty, so the emptiness check is
        # only made for non-redirects.
        if not is_redirect and nontalkpage.isEmpty():
            stuts[u'empty']='yes'
            wikipedia.output(u'WARING: [[%s]] is Empty!!!' % nontalkpage.title())
        if not nontalkpage.canBeEdited():
            stuts[u'protect']='yes'
            wikipedia.output(u'[[%s]] is protected!!!' % nontalkpage.title())
    else:
        stuts[u'nomain']='none'
        wikipedia.output(u'[[%s]] is Not exist!!!' % nontalkpage.title())

    if nontalkpage.isImage():
        imagepage=wikipedia.ImagePage(site, nontalkpage.title())
        if imagepage.fileIsOnCommons():
            stuts[u'nowcommons']='yes'
        # NOTE(review): the original also fetched the page's templates here
        # to compute a featured-picture flag, but that value was never
        # returned or read; the lookup was dropped as dead code.

    if not pg.canBeEdited():
        stuts[u'talkprotect']='yes'
        wikipedia.output(u'[[%s]] is Protected!!!' % title)

    return stuts
        
	
def start(name, attrs):
    """Expat start-element handler that renders one index line per thread.

    For every <rc> element whose comment names a section ("/* Section */"),
    the first edit seen for each (page, section) pair is turned into a
    {{talkindex}} line appended to tempList, and the counter n is increased.
    Redirect pages are skipped.  While doing so the thread is checked for
    being hot, the state of the subject page is collected, and busy threads
    may get {{indiscussion}} added unless the page is linked from
    User:Talkindexbot/blist.

    Entries lacking one of the expected attributes, or whose comment names
    no section, are ignored.
    """
    global n
    if name != 'rc':
        return
    try:
        title=attrs[u'''title''']
        user=attrs[u'''user''']
        talk=attrs[u'''comment''']
        timestamp=attrs[u'''timestamp''']
        ns=attrs[u'''ns''']
        try:
            # Skip threads that were already listed.
            talk=re.search(talkformat, talk).groups()[0]
            t=title+'#'+talk
            # titleList and talkList are appended in lockstep, so pairing
            # them yields the (page, section) threads emitted so far.
            # Testing the pair, rather than title and section separately,
            # keeps a new thread from being dropped merely because its page
            # and its section name each occurred before in other threads.
            if (title, talk) not in zip(titleList, talkList):
                if wikipedia.Page(site,title).isRedirectPage():
                    wikipedia.output(u'%s is Redirect Page.' % title)
                else:
                    titleList.append(title)
                    talkList.append(talk)
                    # Extension part: annotate the entry.
                    hot=bighot(t, title)
                    stuts=nontalk(title)
                    # Featured/good status only applies to article talk
                    # pages (namespace 1).
                    if ns=='1':
                        fg=FGflag(title,FAcats, GAcats, FALcats)
                    else:
                        fg=''
                    i=Indiscussion(t,title,talk,days)
                    badpage=wikipedia.Page(site,u'User:Talkindexbot/blist')
                    if wikipedia.Page(site,title) not in badpage.linkedPages():
                        IndiscussionAuto(t, title, talk, days, i)
                    else:
                        wikipedia.output(u'%s in black list' % title)

                    temp = template % (title, user, talk, timestamp, i, hot, fg, stuts[u'nomain'], stuts[u'disambig'], stuts[u'redirect'],stuts[u'empty'], stuts[u'protect'], stuts[u'nowcommons'], stuts[u'talkprotect'])
                    tempList.append(temp)
                    n=n+1
        except AttributeError:
            # The comment contains no section marker (re.search gave None).
            return
    except KeyError:
        # The element lacks one of the expected attributes.
        return

# Run the XML data through a handler.
def Parsexml(html, start_element):
    """Parse the XML document `html`, calling `start_element` for every opening tag.

    `start_element` is called as start_element(name, attrs) with the element
    name and its attributes.  A document that is not well formed does not
    raise: the error is reported on the console and parsing stops, so the
    handler may have seen only the part before the error.
    """
    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    # Ask for unicode strings where the parser still offers that switch;
    # parsers without the attribute do not need it set.
    if hasattr(p, 'returns_unicode'):
        p.returns_unicode = True
    try:
        p.Parse(html)
    except xml.parsers.expat.ExpatError:
        # Stay best effort, but make the truncated result visible.
        wikipedia.output(u'XML parse error, data may be incomplete: %s' % sys.exc_info()[1])
        return
		
def run():
    """Rebuild every talk index page, then sleep; repeat forever.

    For each talk namespace the recent changes of the last `days` days are
    fetched and parsed twice: first with `listelement` to collect the
    activity counts, then with `start` to render the index lines.  The
    result is saved to that namespace's index page.  After all namespaces
    are done Template:CurrentDiscussion is refreshed and the bot sleeps for
    four hours.
    """
    global n, tempList
    hours = 4
    stamp = "%Y%m%d%H%M%S"
    # Namespace number -> (index page, label used in the introduction,
    # label used in the section heading).
    ns = {
        '1': (u'Wikipedia:对话页讨论索引/条目', u'条目', u'条目'),
        '5': (u'Wikipedia:对话页讨论索引/wikipedia', u'Wikipedia(项目、方针等页面)', u'Wikipedia'),
        '7': (u'Wikipedia:对话页讨论索引/图像', u'图像', u'图像'),
        '9': (u'Wikipedia:对话页讨论索引/mediawiki', u'Mediawiki(系统界面)', u'Mediawiki'),
        '11': (u'Wikipedia:对话页讨论索引/模板', u'模板', u'模板'),
        '13': (u'Wikipedia:对话页讨论索引/帮助', u'帮助', u'帮助'),
        '15': (u'Wikipedia:对话页讨论索引/分类', u'分类', u'分类'),
        '101': (u'Wikipedia:对话页讨论索引/主题', u'主题(Portal)', u'主题'),
    }
    header = u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科%s对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在%s对话页上的讨论 ==\n'''

    while True:
        # Sample the clock once so that both ends of the period derive from
        # the same instant.
        moment = time.gmtime()
        rcstart = time.strftime(stamp, moment)
        end = datetime.datetime(*moment[:6]) - datetime.timedelta(days=days)
        rcend = end.strftime(stamp)

        for rcns, (wiki, intro_label, heading_label) in ns.items():
            # Start every namespace from a clean slate.  Without this the
            # module-level lists keep the entries of earlier passes, so
            # from the second cycle on every edit is counted again and
            # every thread already listed once is treated as a duplicate.
            for state in (tlist, tulist, titleList, talkList):
                del state[:]
            tempList = []
            n = 0

            html = GetPage(rcstart,rcend, rcns)
            Parsexml(html, listelement)
            Parsexml(html, start)

            basewiki = header % (intro_label, days, heading_label)
            wikipedia.output(u'Namespace: %s: Total %s talk in %s days' % (rcns, n, days))
            Lists = "".join(tempList)
            if Lists == "":
                basewiki=basewiki+u'当前没有活跃的讨论。'
                comment=u"Bot更新讨论索引:最近%s天,Namespace %s 上没有活跃的讨论" % (days, rcns)
            else:
                basewiki=basewiki+(u'当前共有%s项讨论。最后更新于~~~~~\n' % n)+ Lists
                comment=u"Bot更新讨论索引:最近%s天内,Namespace %s 上共有%s项讨论" % (days, rcns, n)
            pg=wikipedia.Page(site,wiki)
            pg.put(basewiki, comment, minorEdit=False)
        CurrentDiscussion()

        now = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime())
        wikipedia.output(u'\nDone.')
        wikipedia.stopme()
        print('\nSleeping %s hours, now %s' % (hours, now))
        time.sleep(hours *60 *60)
# Length, in days, of the period covered by the discussion index.
days=7
# Run the bot; wikipedia.stopme() is called even when run() ends with an
# exception.
try:
    run()
finally:
    wikipedia.stopme()

--百無一用是書生 () 2008年3月5日 (三) 12:06 (UTC)