"""fc2rss Screen scrape an OU FirstClass conference or two into RSS files. Avoid the appalling FC interface! Stuart Langridge, http://www.kryogenix.org/ v1.0 2003-10-11 """ import urllib,re,urlparse,os,cgi # Required variables OU_USERNAME = 'YOUR_OU_USERNAME' OU_PASSWORD = 'YOUR_OU_PASSWORD' CONFERENCES = [ 'http://oufcnt2.open.ac.uk/Login/0007584D-80000001/', # m874 'http://oufcnt2.open.ac.uk/Login/00075E75-80000001/' # m879 ] RSS_PATH = '/home/httpd/test/ou/rss' debugmode = 0 reTitle = re.compile(r'^\s*<title>(?P<title>[^<]+)</title>\s*$') reItem = re.compile(r'\s*leaf\[\d+\]=new Array\(\d+,\d+,\d+,\d+,"(?P<id>[^"]+)", "(?P<author>[^"]+)"\);\s*$') reDiv = re.compile(r'\s*<div ') reEndDiv = re.compile(r'\s*</div>') reHTML = re.compile('<[^>]+>') ws = re.compile(r'[\x0b\x0c\n\r]') # Handle required authentication class myOpener(urllib.FancyURLopener): def prompt_user_passwd(self,host,realm): return (OU_USERNAME,OU_PASSWORD) urllib._urlopener = myOpener() def parseConf(conf): title = '' items = [] fp = urllib.urlopen(conf) data = ws.split(fp.read()) for line in data: t = reTitle.match(line) if t: title = t.groupdict()['title'] if t and debugmode: print '---%s---' % title i = reItem.match(line) if i: title2 = '' text2 = '' capture = 0 iid = i.groupdict()['id'] iauth = i.groupdict()['author'] itemuri = urlparse.urljoin(conf,iid) fp2 = urllib.urlopen(itemuri) data2 = ws.split(fp2.read()) for line2 in data2: t2 = reTitle.match(line2) if t2: title2 = t2.groupdict()['title'] if t2 and debugmode: print ' %s' % title2 divs = reDiv.match(line2) if divs: capture = 1 if capture: text2 += line2 dive = reEndDiv.search(line2) if dive: capture = 0 fp2.close() text2 = re.sub('<br>','\n',text2) text2 = reHTML.sub('',text2) text2 = re.sub('\n','<br>',text2) text2 = re.sub('\x00','',text2) items.append((iid,iauth,itemuri,title2,text2)) fp.close() return title,items # Walk the conferences confdata = {} for conf in CONFERENCES: title,items = parseConf(conf) if title and items: confdata[title] = (conf,items) for conf in confdata.keys(): fp = open(os.path.join(RSS_PATH,conf.replace(' ','_')) + '.xml','w') url,items = confdata[conf] fp.write('''<rss version="2.0"> <channel> <title>%s</title> <link>%s</link> <description>%s</description> ''' % (conf,url,conf)) for item in items: iid,auth,uri,title,text = item fp.write(''' <item> <author>%s</author> <description>%s</description> <link>%s</link> <title>%s</title> </item>''' % (auth,cgi.escape(text),uri,title)) fp.write(' </channel>\n</rss>\n')