#!/usr/bin/python
# pdf2htmlpres.py - Converts a PDF file to a series of image files
# (currently JPEG) and makes a HTML slideshow out of them. Though
# it can handle arbitrary PDF's it's meant to be used to generate
# HTML versions of PDF presentations (which can be made with Prosper)
#
# Requires the program 'convert' to be in the user's $PATH
#
# Rajarshi Guha
# 14/11/2002
#
# Updated 15/11/2002 - allowed the user to define image type, output dir
# and put in some error checking
#
# Updated 16/11/2002 - made some changes to the change numbering and
# also fixed it so that it will work properly for PDF's with just 1
# page!
#
# Updated 17/11/2002 - the script now uses Ghostscript directly to do
# PDF conversion, since it seems convert in ImageMagick 5.4.7 cannot
# convert a multi page PDF to graphic images. The script still needs the
# mogrify program, since GS doesn't produce GIF output - we need to
# convert GS generated JPEG's to GIF's. (PNG output uses the png256
# device in GhostScript)
#
# Update 18/11/2002 - the script now has 2 new options. The m switch can be used to
# specify which conversion method to use. Either ImageMagick's convert or direct
# GhostScript. So if you have a working convert (5.3.8 works) then use convert to
# get smoother text in the graphic output. If your version of convert only produces
# one page from the PDF, then use GhostScript as the conversion method.
# Using the -x and -y switches you can also specify the width and height respectively
# of the main slide image.
#
# Update 21/11/2002 - added an option to allow control of the form of
# the index (or TOC) page. Now with the -t or --toc option you can have
# images, text or both on the index page. Requires the LaTeX source of
# the PDF presentation to be present.
#
# Update 22/11/2002 - fixed a bug which caused crashes during parsing of
# slide titles.
# It's fixed partially, but if the slide title has \emph{}
# or other such sequences (ie if it contains {,}) the extracted title
# will not be correct. Also fixed a Python 2.2 idiom so that the code
# also works with Python 1.5.2
#
# Update 26/11/2002 - Thanks to Alain Bannay for sending in a patch to
# take into account slides generated using the \part command as well an
# improved regex to get slide titles of the form
# \begin{slide}{\command{Title}} etc (ie multiply nested LaTeX
# commands).
#
# Update 28/10/2003 - Thanks to Ki Joo Kim for pointing out that the
# slides are sorted according to filenames so slide10.jpg comes before
# slide2.jpg. This is corrected so that slides are in the proper order
# I've also updated the PNG output from GS
#
# Update 11/04/2004 - Thanks to Paolo Costa for pointing out a problem with
# name generation when using ImageMagick to convert PDF's to image files.
#
# Update 17/11/2005 - Thanks to Caner Kazanci for pointing out an error
# when Imagemagick was used to convert the PDF. The resultant images were
# misnamed and not recognized by subsequent code to actually put the HTML
# version together
#
# This version is written to run unchanged on Python 2.6+ and Python 3.
#
# NOTE(review): the HTML markup in the templates below is a reconstruction.
# The copy of the script this was restored from had every tag stripped out
# of its string literals, so only the %-placeholders and their order were
# known. Confirm the layout against what you expect before publishing.

import getopt
import glob
import operator
import os
import re
import shutil
import string  # no longer used here; kept so existing importers do not break
import subprocess
import sys

# Opening and closing boilerplate shared by every generated HTML page.
head = """<html>
<head>
<link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
"""

foot = """
</body>
</html>
"""

# Path of the PDF being converted.  toc() falls back to this module level
# value to locate the LaTeX source when it is not given one explicitly.
pdffile = ''

# Wrapper around the previous/next links; uses the .tback rule from style.css.
_NAV_OPEN = '<table width="100%"><tr><td class="tback" align="center">\n'
_NAV_CLOSE = '\n</td></tr></table>\n\n'

_CSS = """
.tback {
    font-size: large;
    font-weight: bold;
    background-color:#CCCCFF;
    padding: .5em;
}"""

_FRAMESET = """<html>
<head><title>Presentation</title></head>
<frameset cols="20%%,*">
<frame src="left.html" name="toc">
<frame src="%s" name="main">
</frameset>
</html>
"""

_TOC_TYPES = ('image', 'text', 'both')
_GRAPHIC_TYPES = ('jpg', 'jpeg', 'png', 'gif')
_METHODS = ('im', 'gs')

# Raw strings throughout: a plain '\begin' would contain a backspace.
_FIRST_INT_RE = re.compile(r'[0-9]+')
_PART_TITLE_RE = re.compile(r'\\part(\[.*\])?.*\{(.*[^}])\}')
_SLIDE_TITLE_RE = re.compile(r'\{(.*)\}.*\{(.*[^}])\}')
_PAGE_NUM_RE = re.compile(r'page([0-9]+)\.')


class tc:
    """Pair an image file name (fn) with its numeric page index (num)."""

    def __init__(self, fn, num):
        self.fn = fn
        self.num = num


def sort_by_attr(seq, attr):
    """Return a new list of the items of seq ordered by attribute attr.

    The sort is stable, so items with equal attribute values keep their
    original relative order and the items themselves are never compared.
    """
    return sorted(seq, key=operator.attrgetter(attr))


def sort_by_attr_inplace(lst, attr):
    """Reorder lst in place by attribute attr (see sort_by_attr)."""
    lst[:] = sort_by_attr(lst, attr)


def _escape_html(text):
    """Escape the characters that are special in HTML text and attributes."""
    for raw, entity in (('&', '&amp;'), ('<', '&lt;'),
                        ('>', '&gt;'), ('"', '&quot;')):
        text = text.replace(raw, entity)
    return text


def arrowtext(prev, next):
    """Return the HTML navigation bar for one slide page.

    prev and next are the file names of the neighbouring pages; pass an
    empty string for a neighbour that does not exist and that link is
    simply left out.
    """
    links = []
    if prev:
        links.append('<a href="%s">&lt;&lt; Previous</a>' % _escape_html(prev))
    if next:
        links.append('<a href="%s">Next &gt;&gt;</a>' % _escape_html(next))
    return _NAV_OPEN + ' | '.join(links) + _NAV_CLOSE


def _parse_slide_titles(lines):
    """Collect one title per generated slide from LaTeX source lines.

    A slide declared after an overlays command with count N contributes N
    copies of its title, because it expands to N pages in the PDF.  Lines
    that mention a part or slide but carry no recognisable title are
    skipped rather than aborting the run.
    """
    titles = []
    in_overlay = False
    in_slide = False
    numoverlay = 0
    for line in lines:
        if '\\overlays{' in line:
            in_overlay = True
            count = _FIRST_INT_RE.search(line)
            if count:
                numoverlay = int(count.group())
        if '\\part' in line:
            found = _PART_TITLE_RE.findall(line)
            if found:
                titles.append(found[0][1])
        if 'begin{slide}' in line:
            found = _SLIDE_TITLE_RE.findall(line)
            if found:
                in_slide = True
                titles.append(found[0][1])
        if in_slide and in_overlay and titles:
            # The title was appended once already, so add the other N-1.
            titles.extend([titles[-1]] * (numoverlay - 1))
            numoverlay = 0
            in_overlay = False
        if '\\end{slide}' in line:
            in_slide = False
    return titles


def toc(slides, outputdir, toctype='image', texfile=None):
    """Write the index page outputdir/left.html.

    slides    -- image file names in presentation order
    outputdir -- directory the HTML pages are written to
    toctype   -- 'image', 'text' or 'both' (case-insensitive)
    texfile   -- LaTeX source to read slide titles from; defaults to the
                 module level pdffile with its extension replaced by .tex

    Raises ValueError for an unknown toctype.  For 'text' and 'both' the
    process exits with status 1 when the LaTeX source cannot be opened
    (after removing outputdir) or when the number of titles found does
    not match the number of slides.
    """
    toctype = toctype.lower()
    if toctype not in _TOC_TYPES:
        raise ValueError('toctype must be one of image|text|both, got %r'
                         % (toctype,))

    if toctype in ('text', 'both'):
        if texfile is None:
            # splitext only touches the final extension, so paths such as
            # ./talk.pdf or my.dir/talk.pdf resolve correctly.
            texfile = os.path.splitext(pdffile)[0] + '.tex'
        try:
            source = open(texfile, 'r')
        except IOError:
            print("""
            To generate the table of contents, the original LaTeX source must be
            present in the same directory as the PDF. If the PDF is called
            file.pdf, the LaTeX source should be file.tex
            """)
            # Perform the cleanup
            shutil.rmtree(outputdir, ignore_errors=True)
            sys.exit(1)
        try:
            slidetitles = _parse_slide_titles(source)
        finally:
            source.close()

        # The counts should differ by exactly one: the title slide has no
        # text title of its own.
        if len(slidetitles) != len(slides) - 1:
            print("""
            The number of titles in the LaTeX source (including overlay slides)
            does not match the number of slides. First make sure that the PDF
            file you are converting is generated from the LaTeX file being
            used (%s).
            Otherwise it's probably an error in the algorithm. Mail me :)
            """ % (texfile,))
            sys.exit(1)
        slidetitles.insert(0, '')
    else:
        slidetitles = [''] * len(slides)

    out = open(os.path.join(outputdir, 'left.html'), 'w')
    try:
        out.write(head)
        for number, (slide, title) in enumerate(zip(slides, slidetitles)):
            page = _escape_html('page%d.html' % (number + 1))
            image = _escape_html(slide)
            label = _escape_html(title)
            if toctype == 'image':
                link = ('<a href="%s" target="main">'
                        '<img src="%s" width="120" border="0"></a>\n'
                        '<br><br>\n' % (page, image))
            elif toctype == 'text':
                link = ('<a href="%s" target="main">%s</a>\n'
                        '<br><br>\n' % (page, label))
            else:
                link = ('<a href="%s" target="main">'
                        '<img src="%s" width="120" border="0"><br>%s</a>\n'
                        '<br><br>\n' % (page, image, label))
            out.write(link)
        out.write(foot)
    finally:
        out.close()


##########################################
def usage():
    """Print the command line help and exit with status 1."""
    print("""
    Usage: pdf2htmlpres [options]

    Converts the PDF presentation to HTML form, each slide on its own page

    -f, --file     Required! Specify the PDF file to convert
    -g, --graphic  The argument can be either png|jpg|gif
                   Default is jpg
    -d, --dir      The output directory for the HTML pages.
                   Defaults to ./html/
    -m, --method   Argument can be im|gs for ImageMagick's convert or
                   GhostScript respectively. Use -m gs if your version of
                   convert produces a single image from the multi page PDF
                   The default is ImageMagick's convert
    -x, --x        width of the main image (in pixels)
    -y, --y        height of main image (in pixels)
                   (Its a good idea to specify both x and y, since only
                   specifying width or only height causes the other to be
                   automatically the same)
    -t, --toc      Argument can be image|text|both. Controls the form of
                   the index page (or TOC). The default is images (ie
                   thumbnails of the slides) only.
    -h, --help     This message
    """)
    sys.exit(1)


##########################################
def _run(cmd):
    """Run an external program without a shell and return its exit status.

    Passing the arguments as a list keeps file names containing spaces or
    shell metacharacters intact.  Exits with status 1 when the program
    cannot be started at all.
    """
    try:
        return subprocess.call(cmd)
    except OSError:
        print('Could not run %s - is it installed and in your $PATH?' % cmd[0])
        sys.exit(1)


def _convert_pdf(pdf, outputdir, gfxsuffix, method):
    """Render every page of pdf into outputdir as page<N>.<suffix> images."""
    wanted = gfxsuffix.lower()
    if method == 'gs':
        # GhostScript has no GIF device, so GIF goes via JPEG + mogrify.
        if wanted == 'png':
            device, suffix = 'png16m', 'png'
            extra = ['-dTextAlphaBits=4', '-dGraphicsAlphaBits=4']
        else:
            device, suffix = 'jpeg', 'jpg'
            extra = []
        target = '-sOUTPUTFILE=%s/page%%d.%s' % (outputdir, suffix)
        _run(['gs', '-dNOPAUSE', '-dQUIET', '-sBATCH', '-sDEVICE=' + device]
             + extra + [target, pdf])
        if wanted == 'gif':
            jpegs = sorted(glob.glob(os.path.join(outputdir, '*.jpg')))
            if jpegs:
                _run(['mogrify', '-format', 'gif'] + jpegs)
                for name in jpegs:
                    os.remove(name)
    else:
        _run(['convert', pdf, '%s/page%%d.%s' % (outputdir, gfxsuffix)])


def _collect_slides(filenames):
    """Return the page<N>.* names from filenames ordered by N, not by text."""
    found = []
    for name in filenames:
        match = _PAGE_NUM_RE.search(name)
        if match:
            found.append(tc(name, int(match.group(1))))
    sort_by_attr_inplace(found, 'num')
    return [item.fn for item in found]


def _image_tag(image, width, height):
    """Return the centred <img> element for a slide, sized when requested."""
    attrs = ['src="%s"' % _escape_html(image)]
    if width:
        attrs.append('width="%s"' % _escape_html(width))
    if height:
        attrs.append('height="%s"' % _escape_html(height))
    attrs.append('alt="%s"' % _escape_html(image))
    return '<center>\n<img %s>\n</center>\n' % ' '.join(attrs)


def _write_file(path, text):
    """Write text to path, closing the file even if the write fails."""
    out = open(path, 'w')
    try:
        out.write(text)
    finally:
        out.close()


def main(argv=None):
    """Convert the PDF named on the command line into an HTML slideshow.

    argv defaults to sys.argv[1:].  Exits with status 1 on bad options,
    an unreadable or non-PDF input, or when no slide images are produced.
    """
    global pdffile
    if argv is None:
        argv = sys.argv[1:]

    # Default values
    toctype = 'image'
    pdffile = ''
    gfxsuffix = 'jpg'
    outputdir = './html'
    method = 'im'
    width = ''
    height = ''

    if not argv:
        usage()
    try:
        opt, args = getopt.getopt(
            argv, "t:m:x:y:hd:f:g:",
            ["toc=", "method=", "x=", "y=", "dir=", "help", "file=",
             "graphic="])
    except getopt.GetoptError:
        usage()

    for o, a in opt:
        if o in ('-h', '--help'):
            usage()
        if o in ('-f', '--file'):
            pdffile = a
        if o in ('-g', '--graphic'):
            if a.lower() not in _GRAPHIC_TYPES:
                usage()
            gfxsuffix = a
        if o in ('-d', '--dir'):
            outputdir = a
        if o in ('-m', '--method'):
            if a.lower() not in _METHODS:
                usage()
            method = a.lower()
        if o in ('-x', '--x'):
            width = a
        if o in ('-y', '--y'):
            height = a
        if o in ('-t', '--toc'):
            if a.lower() not in _TOC_TYPES:
                usage()
            # Store the normalised form so later comparisons match.
            toctype = a.lower()

    # Check user supplied args
    if pdffile == '':
        print('You need to supply a PDF to convert!')
        sys.exit(1)
    try:
        f = open(pdffile, 'rb')
    except (IOError, OSError):
        print('Error opening ' + pdffile)
        sys.exit(1)
    try:
        magic = f.read(4)
    finally:
        f.close()
    # The file is opened in binary mode, so compare against bytes.
    if magic != b'%PDF':
        print('The supplied file doesnt seem to be a PDF :(')
        sys.exit(1)

    ###########################################
    #
    # OK, lets start!
    # An existing output directory is erased and recreated.
    if os.path.isdir(outputdir) and not os.path.islink(outputdir):
        shutil.rmtree(outputdir)
    elif os.path.lexists(outputdir):
        os.remove(outputdir)
    os.mkdir(outputdir)

    # Lets convert the PDF
    print('Converting PDF to ' + gfxsuffix.upper())
    _convert_pdf(pdffile, outputdir, gfxsuffix, method)

    # Generate the slide show
    print('Generating HTML pages...')

    # List the images before any HTML is written into the same directory.
    slides = _collect_slides(os.listdir(outputdir))
    if not slides:
        print('No slide images were produced - check the conversion output')
        sys.exit(1)

    # Create the page for each slide
    total = len(slides)
    for index, image in enumerate(slides):
        number = index + 1
        prev = ''
        next_page = ''
        if index > 0:
            prev = 'page%d.html' % (number - 1)
        if index < total - 1:
            next_page = 'page%d.html' % (number + 1)
        # A one page presentation has nothing to navigate to.
        nav = ''
        if total > 1:
            nav = arrowtext(prev, next_page)
        body = head + nav + _image_tag(image, width, height) + nav + foot
        _write_file('%s/page%d.html' % (outputdir, number), body)

    # Write the CSS style file
    _write_file(outputdir + '/style.css', _CSS)

    # Write the index page
    toc(slides, outputdir, toctype)

    # Write the frameset page
    _write_file(outputdir + '/index.html', _FRAMESET % ('page1.html',))


if __name__ == '__main__':
    main()