#!/usr/bin/env python
# -*- coding: utf-8 -*-

# $Id: extract_onlinehelp_links.py 10982 2010-09-15 11:02:25Z pstorz $

import re
import pprint
from httplib import *

# Common URL prefix; page file names (and the anchors found in them) are
# appended to it to form absolute links.
baseurl = 'http://www.bacula.org/5.0.x-manuals/en/main/main/'

# Configuration area -> file name of the manual page to scan, relative to
# baseurl.
htmlpages = {
    'client': 'Client_File_daemon_Configur.html',
    'storagedaemon': 'Storage_Daemon_Configuratio.html',
    'autochanger': 'Autochanger_Resource.html',
    'director': 'Configuring_Director.html',
    'console': 'Console_Configuration.html',
    'monitor': 'Monitor_Configuration.html',
    'messages': 'Messages_Resource.html',
}
# Sample of the "Table of Child-Links" markup; every HREF in it points into
# Configuring_Director.html.  It shows the structure the scraper has to deal
# with: a line-initial <UL CLASS="ChildLinks"> ... </UL> list that contains
# nested <UL> sub-lists and one <A ... HREF="page#anchor">title</A> per entry.
# NOTE(review): nothing in the executable code of this script appears to read
# this string (pages are fetched over HTTP instead) -- presumably kept as a
# reference/test fixture; confirm before removing.
childtablelinks = '''
<!--Table of Child-Links-->
<A NAME="CHILD_LINKS"><STRONG>Subsections</STRONG></A>

<UL CLASS="ChildLinks">

<LI><A NAME="tex2html1449"
  HREF="Configuring_Director.html#SECTION001810000000000000000">Director Resource Types</A>
<LI><A NAME="tex2html1450"
  HREF="Configuring_Director.html#SECTION001820000000000000000">The Director Resource</A>
<LI><A NAME="tex2html1451"
  HREF="Configuring_Director.html#SECTION001830000000000000000">The Job Resource</A>
<LI><A NAME="tex2html1452"
  HREF="Configuring_Director.html#SECTION001840000000000000000">The JobDefs Resource</A>
<LI><A NAME="tex2html1453"
  HREF="Configuring_Director.html#SECTION001850000000000000000">The Schedule Resource</A>
<LI><A NAME="tex2html1454"
  HREF="Configuring_Director.html#SECTION001860000000000000000">Technical Notes on Schedules</A>
<LI><A NAME="tex2html1455"
  HREF="Configuring_Director.html#SECTION001870000000000000000">The FileSet Resource</A>
<LI><A NAME="tex2html1456"
  HREF="Configuring_Director.html#SECTION001880000000000000000">FileSet Examples</A>
<LI><A NAME="tex2html1457"
  HREF="Configuring_Director.html#SECTION001890000000000000000">Backing up Raw Partitions</A>

<LI><A NAME="tex2html1458"
  HREF="Configuring_Director.html#SECTION0018100000000000000000">Excluding Files and Directories</A>
<LI><A NAME="tex2html1459"
  HREF="Configuring_Director.html#SECTION0018110000000000000000">Windows FileSets</A>
<UL>
<LI><A NAME="tex2html1460"
  HREF="Configuring_Director.html#SECTION0018110010000000000000">A Windows Example FileSet</A>
<LI><A NAME="tex2html1461"
  HREF="Configuring_Director.html#SECTION0018110020000000000000">Windows NTFS Naming Considerations</A>
</UL>
<BR>
<LI><A NAME="tex2html1462"
  HREF="Configuring_Director.html#SECTION0018120000000000000000">Testing Your FileSet</A>
<LI><A NAME="tex2html1463"
  HREF="Configuring_Director.html#SECTION0018130000000000000000">The Client Resource</A>
<LI><A NAME="tex2html1464"
  HREF="Configuring_Director.html#SECTION0018140000000000000000">The Storage Resource</A>

<LI><A NAME="tex2html1465"
  HREF="Configuring_Director.html#SECTION0018150000000000000000">The Pool Resource</A>
<UL>
<LI><A NAME="tex2html1466"
  HREF="Configuring_Director.html#SECTION0018151000000000000000">The Scratch Pool</A>
</UL>
<BR>
<LI><A NAME="tex2html1467"
  HREF="Configuring_Director.html#SECTION0018160000000000000000">The Catalog Resource</A>
<LI><A NAME="tex2html1468"
  HREF="Configuring_Director.html#SECTION0018170000000000000000">The Messages Resource</A>
<LI><A NAME="tex2html1469"
  HREF="Configuring_Director.html#SECTION0018180000000000000000">The Console Resource</A>
<LI><A NAME="tex2html1470"
  HREF="Configuring_Director.html#SECTION0018190000000000000000">The Counter Resource</A>
<LI><A NAME="tex2html1471"
  HREF="Configuring_Director.html#SECTION0018200000000000000000">Example Director Configuration File</A>

</UL>
<!--End of Table of Child-Links-->
'''


# Matches a whole child-links list: from a line starting with
# <UL CLASS="ChildLinks"> up to the LAST line that starts with </UL>.
# '.' spans newlines (DOTALL) and the quantifier is greedy, so the closing
# tags of nested <UL> sub-lists do not end the match early.
RXP_TABLE_OF_CHILDLINKS = re.compile(
    r'^<UL CLASS="ChildLinks">(.*)^</UL>',
    re.MULTILINE | re.DOTALL,
)

# Matches only links whose text ends in " Resource" (e.g. "The Job Resource").
#   anchor -- the HREF target (page#section)
#   title  -- the link text without the optional leading "The " and without
#             the trailing " Resource"
# Links such as "Director Resource Types" or "FileSet Examples" are not
# matched because " Resource" must be directly followed by </A>.
RXP_HREF = re.compile(
    r'HREF="(?P<anchor>[^"]+)">(The )?(?P<title>.*) Resource</A>'
)



# Result of the scrape:
#   {config name: {lower-cased resource title: absolute URL of its section}}
htmlhelp = {}

# One connection is reused for all pages and is closed even if a request
# or the parsing fails part-way through.
connection = HTTPConnection('www.bacula.org')
try:
    for config, page in htmlpages.items():
        connection.request('GET', baseurl + page)
        reply = connection.getresponse()
        # print() receives a single pre-formatted string so the output is the
        # same whether print is the Python 2 statement or a function.
        print('%s %s' % (reply.status, reply.reason))

        # Always read the body: the connection cannot send the next request
        # until the previous response has been consumed.
        htmltext = reply.read()

        # Every configured page gets an entry, even when nothing is found.
        htmlhelp[config] = {}

        if reply.status != 200:
            # Do not scrape error or redirect bodies; the status line printed
            # above already reports the problem.
            continue

        for childlinks in RXP_TABLE_OF_CHILDLINKS.finditer(htmltext):
            for href in RXP_HREF.finditer(childlinks.group(0)):
                title = href.group('title')
                anchor = href.group('anchor')
                print('%s %s link: %s' % (config, title, anchor))
                # Keys are lower-cased titles; a later link with the same
                # title on the same page overwrites the earlier one.
                htmlhelp[config][title.lower()] = baseurl + anchor
finally:
    connection.close()

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(htmlhelp)