#!/usr/bin/perl # (C) 2002 Yonat Sharon # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. use strict; use Text::Template; # documentation: sub usage { print < headings. Usage: split.pl [-h N] [-index ] [-page ] source HTML file to split -h N split on

to (default is 3) -index template file to use for the index page (default is index-template.html) -page template file to use for the section pages (default is page-template.html) See splity.html for details on the format of the source HTML file and the template files. END exit; } return 1 if ($0 =~ /splity.t$/); usage() unless (@ARGV); # init parameters: my $source = pop; # TODO: default to 'all.html' my %param = @ARGV; my $max_h_level = $param{'-h'} || 3; my $index_template = $param{'-index'} || 'index-template.html'; my $page_template = $param{'-page'} || 'page-template.html'; # extract data: my $s = slurp_file($source); my ($opening, $closing, @sections) = split_sections($s, $max_h_level); my $doc_vars = {'opening' => $opening, 'closing' => $closing, 'source' => $source}; my @pages; foreach my $section (@sections) { push(@pages, extract_page_data($section)); } create_structure(\@pages); # fill-in empty names and titles: for (my $i = 0; $i < scalar(@pages); ++$i) { $pages[$i]->{'name'} ||= $i || 'index'; # TODO: should be according to parent, like 1.4.2 $pages[$i]->{'title'} ||= $pages[$i]->{'name'}; } # create index my $index_page = shift @pages; my $tt = new Text::Template(SOURCE => $index_template) or die "Can't construct template $index_template: $Text::Template::ERROR"; output_page($tt, $index_page, $doc_vars); # create pages $tt = new Text::Template(SOURCE => $page_template) or die "Can't construct template $page_template: $Text::Template::ERROR"; foreach my $page (@pages) { output_page($tt, $page, $doc_vars); } ################################################################################ ### output_page($template, $page) # Create output file for $page using $template. sub output_page { my ($template, $page, $doc_vars) = @_; open(F, "> $page->{'name'}.html") or die "Can't open $page->{'name'}.html for writing: $!"; $template->fill_in(HASH => [$page, $doc_vars], OUTPUT => \*F) or die "Can't fill_in template for $page->{'name'}: $Text::Template::ERROR"; close F; } ### $s = slurp_file($filename) # Read the whole file into a string. ############ sub slurp_file { local $/; open(F, $_[0]) or die "Can't open $_[0]: $!"; my $s = ; close(F); return $s; } ### ($opening, $closing, @sections) = split_sections($html, $max_h_level) ###### ### Split HTML file to sections according to headings 1-$max_h_level. sub split_sections { # split to @sections, and common $opening and $closing my $level = qr/[1-$_[1]]/i; my @sections = split(/(?=]+style="[^"]*display\s*:\s*none\s*;/is; if ($sections[$#sections] =~ /^$closing_tag/) { $closing = pop @sections; } else { $sections[$#sections] =~ s!($closing_tag.*)!!is; $sections[$#sections] =~ s!( in heading ### title_tag - complete heading tag ### content - everything but the heading ### excerpt - content of tag (if exists) sub extract_page_data { local $_ = $_[0]; my $page; my $title; # extract title s!<(h(\d)[^>]*)>(.*?)!!is; $page->{'title_tag'} = $1; $page->{'level'} = $2; $title = $3; # extract $page->{'name'} = $1 if $title =~ s!]+name="(.*?)"[^>]*>(.*?)!$2!is; # TODO: quotes should not be required $title =~ s!\s*\n! !g; $page->{'title'} = $title; # extract excerpt $page->{'excerpt'} = $1 if s!([^<]*)!$1!i; # redirect internal links s!\bhref="#(.*?)"!href="$1.html"!gi; # TODO: only inside A tags! $page->{'content'} = $_; return $page; } ### create_structure(\@array_of_hash_refs) ##################################### ### Create linking fields between @array elements, based on their 'level' field: ### top - first element ### prev - previous element ### next - next element ### up - parent element ### sub - array of child elements (direct children only) sub create_structure { my $array = shift; my $top = $array->[0]; my $level = $top->{'level'}; my $last; foreach my $curr (@$array) { $curr->{'top'} = $top; if ($last) { $curr->{'prev'} = $last; $last->{'next'} = $curr; # find parent: my $curr_level = $curr->{'level'}; my $last_level = $last->{'level'}; if ($last_level == $curr_level) { # same level $curr->{'up'} = $last->{'up'}; } elsif ($last_level < $curr_level) { # sub-section $curr->{'up'} = $last; } else { # up-section my $up = $last->{'up'}; while ($up->{'level'} >= $curr_level) { $up = $up->{'up'}; } $curr->{'up'} = $up; } push(@{$curr->{'up'}->{'sub'}}, $curr); } $last = $curr; } } ### $html = sub_list(\@sub, $depth, $show_excerpts, $item_tag, $sub_list_tag) ## ### Return HTML multi-level list of descendants. ### \@sub - direct children ### $depth - show descendants up to $depth level (0 means show all) ### $excerpt_format - format string for excerpt, with '$1' as fill-in place. ### e.g., '"$1 ..."' ### $item_tag - wrap each descendant with this tag (eg 'li') ### $sub_list_tag - wrap descendants' sub-lists with this tag (eg 'ul') sub sub_list { my $sub = shift; join("\n" , map {sub_list_item($_, @_)} @$sub); } sub sub_list_item { my ($page, $depth, $excerpt_format, $item_tag, $sub_list_tag) = @_; my $html = qq($page->{'title'}); if ($excerpt_format and $page->{'excerpt'}) { $excerpt_format =~ s/\$1/$page->{'excerpt'}/; $html .= $excerpt_format; } if ($depth != 1 and $page->{'sub'}) { --$depth if ($depth > 0); $html .= "<$sub_list_tag>"; $html .= sub_list($page->{'sub'}, $depth, $excerpt_format, $item_tag, $sub_list_tag); $html .= ''; } return "<$item_tag>$html'; } ### $closing_tag = end_tag($tag_with_attributes) ############################### sub end_tag { my $tag = $_[0]; $tag =~ s/\W.*//s; return $tag; } ### $text = strip_tags($html) # Strip tags out of $html. ####################### sub strip_tags { local $_ = $_[0]; s/<.*?>//gs; s/^\s*//s; s/\s*$//s; return $_; }