import os,re,pickle
from xml.dom import minidom
from xml import xpath

def _parseCarName(n):
    """Split a survey heading into (manufacturer, model, pre).

    n is the heading text of one survey entry.  If it contains a
    parenthesised note, the text between the first "(" and the last ")"
    is returned as `pre` ("" when there is none) and removed from the
    name before it is split.  The first word of what remains is the
    manufacturer and the rest is the model; for the two-word makes
    starting with "Alfa" or "Land" the second word is moved from the
    model into the manufacturer.  `model` is "" when nothing is left.
    """
    haspre = re.search(r'\((.*)\)', n)
    if haspre:
        pre = haspre.group(1)
    else:
        pre = ""

    # Cutting the note out leaves whitespace behind (trailing, leading or
    # doubled), so strip and split on whitespace runs; otherwise the model
    # keeps a stray space or the manufacturer comes out empty.
    parts = re.sub(r'\((.*)\)', '', n).strip().split(None, 1)
    manufacturer = ''
    model = ''
    if parts:
        manufacturer = parts[0]
    if len(parts) == 2:
        model = parts[1]

    if manufacturer in ('Alfa', 'Land') and model:
        parts = model.split(None, 1)
        manufacturer = manufacturer + ' ' + parts[0]
        if len(parts) == 2:
            model = parts[1]
        else:
            model = ''
    return manufacturer, model, pre


def getDetails():
    """Scrape the survey pages in the current directory into a dict.

    Every file in '.' whose name ends in '.xhtml' is parsed.  Within the
    content column, each <h4> supplies an entry's position (the integer
    text of its first child element) and name (its second child node),
    and each <ul class="nobull"> supplies that entry's scores; headings
    and lists are paired up in document order.

    Returns a dict keyed by the full heading text.  Each value is a dict
    with the keys "position" (int), "scores" (label -> first word of the
    score image's alt text), "manufacturer" and "model" (UTF-8 encoded),
    "pre" (the parenthesised note, or "") and "file" (source file name).
    If the same heading occurs more than once, the last one parsed wins.

    Side effect: the result is also pickled to 'cache.pickle' in the
    current directory so later runs can skip the parsing.
    """
    mainxpath = "//div[@class='left w420']/div[@class='contentpadlr']/"
    xhtml_files = [x for x in os.listdir('.') if x.endswith('.xhtml')]
    # os.listdir order is arbitrary; sort so runs are reproducible.
    xhtml_files.sort()

    cars = []
    for f in xhtml_files:
        root = minidom.parse(f).documentElement
        # One query serves both the names and the positions.
        headings = xpath.Evaluate(mainxpath + "h4", root)
        names = [h.childNodes[1].nodeValue.replace('\n', ' ').strip()
                 for h in headings]
        positions = [int(h.childNodes[0].firstChild.nodeValue)
                     for h in headings]
        uls = list(xpath.Evaluate(mainxpath + "ul[@class='nobull']", root))
        cars += zip(names, positions, uls, [f] * len(names))

    carsdict = {}
    for n, p, ul, fil in cars:
        labels = [x.firstChild.nodeValue
                  for x in xpath.Evaluate("li/div[@class='w240 left']", ul)]
        # The score is the first word of the image's alt text.
        values = [x.getAttribute('alt').split()[0]
                  for x in xpath.Evaluate("li/div[@class='left']/img", ul)]
        scores = dict(zip(labels, values))
        manufacturer, model, pre = _parseCarName(n)
        carsdict[n] = {"position": p, "scores": scores,
                       "manufacturer": manufacturer.encode('utf-8'),
                       "model": model.encode('utf-8'),
                       "pre": pre, "file": fil}

    # Cache the parsed data so later runs can load it instead of re-parsing.
    # try/finally (not `with`) keeps this valid on old Python 2 releases
    # while still closing the file if pickling fails.
    fp = open('cache.pickle', 'wb')
    try:
        pickle.dump(carsdict, fp)
    finally:
        fp.close()
    return carsdict

def uniq(s):
    """Return the distinct items of s as a new list.

    s may be any iterable of hashable items (it no longer needs a
    len()).  No particular order is promised for the result; callers
    that need one should sort it.
    """
    # dict.fromkeys de-duplicates in a single pass; list() guarantees a
    # real list, so callers can sort it in place.
    return list(dict.fromkeys(s))

# main
if os.path.exists('cache.pickle'):
  fp = open('cache.pickle','rb')
  carsdict = pickle.load(fp)
  fp.close()
else:
  carsdict = getDetails()

# write out output
manufacturers = [c["manufacturer"] for c in carsdict.values()]
manufacturers = uniq(manufacturers)
manufacturers.sort()

print '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>Top Gear Survey easy access edition</title>
</head>
<body>
<p>This is the easy access version of the <a
href="http://www.topgear.com/content/features/stories/2005/11/stories/01/1.html">BBC
Top Gear 2005 car survey</a>. While the survey itself is massively useful, it's
difficult to navigate; you have to look through all 150-odd entries to find the
car you want. Here's a handy index to all the cars in the survey, with links
to let you jump to each.</p>
<p id="top">
'''
links = ['<a href="#%s">%s</a>' % (m.lower().replace(' ','_'),m)
       for m in manufacturers]
print ' '.join(links)

print '</p><dl>'
for m in manufacturers:
  cars = [c for c in carsdict.values() if c['manufacturer'] == m]
  posns = [c['position'] for c in cars]
  avg = sum(posns) / len(posns)
  print '<dt id="%s">%s (avg. posn. %s) <a href="#top">(top)</a></dt>' % (
    m.lower().replace(' ','_'), m, avg
  )
  print '<dd><ul>'
  for c in cars:
    print '<li><a href="%s%s">%s %s: place %s</a></li>' % (
          'http://www.topgear.com/content/features/stories/2005/11/stories/01/',
          c['file'].replace('.xhtml','.html'),
          c['manufacturer'], c['model'], c['position']
          )
  print '</ul></dd>'
print '</dl>'
print '</body></html>'

