#!/usr/bin/perl -w

=head1 NAME

htmltable2csv - Script to convert HTML tables to CSV

=head1 VERSION

0.01

=head1 SYNOPSIS

  htmltable2csv file.html > file.csv
  curl -sS some/url | htmltable2csv > file.csv
  htmltable2csv --separator ' ' file.html > file.csv

=head1 DESCRIPTION

The name says it all, except for the C<--separator> option, which will be
inserted in the CSV file between tables. The default is a new line
character. The value may be given either as a separate argument
(C<--separator ' '>) or attached with an equals sign
(C<--separator=' '>).

=begin comment

=head1 README

Script to convert HTML tables to CSV

=end comment

=head1 BUGS

Column and row spanning is currently ignored.

Character encoding is not taken into account. If the file is in any
encoding other than ISO Latin 1, it can easily be mangled.

This script is currently slow. It would probably run faster if I made it
use HTML::TableExtract or HTML::TableContentParser, but I couldn't be
bothered to learn the former's interface and the latter does not support
HTML entities, and I already know how to use the bloated monstrosity known
as 'HTML::DOM,' even though its interface is horribly clunky.

=head1 PREREQUISITES

HTML::DOM 0.010

Text::CSV_XS

=head1 SCRIPT CATEGORIES

Web

=head1 AUTHOR AND COPYRIGHT

Copyright (C) 2007 Father Chrysostomos (gro.napc ta tuorps [backwards])

This program is free software; you may redistribute it and/or modify
it under the same terms as perl.

=head1 SEE ALSO

=for comment
NOTE(review): some link targets in this section were lost before this copy
of the file was made. The ones that could be recovered from the
PREREQUISITES and BUGS sections have been restored; the remaining bare
"L" entries still need their original targets filled in.

L<HTML::DOM> and L<Text::CSV_XS>, which this script uses.

L<HTML::TableExtract> and L<HTML::TableContentParser>, or L, which this
script would probably do well to use.

L, which inspired me to write this (when I found that UPS's 2008 zone
charts are HTML files with C<.xls> extensions [!]).

=cut

# OK, here's the code:

use strict;
use warnings;

use Getopt::Long;
use Text::CSV_XS;
#use HTML::TableContentParser;
use HTML::DOM;

# String printed between the CSV output of consecutive tables.
# Getopt::Long is used instead of perl's -s switch because -s treats
# "--separator ' '" as a boolean flag followed by a file name, which
# contradicts the documented usage above.
my $separator = "\n";
GetOptions('separator=s' => \$separator)
    or die "Usage: $0 [--separator STRING] [file.html ...]\n";

# binary => 1 lets cell text contain newlines and non-ASCII bytes.
my $csv = Text::CSV_XS->new({ binary => 1 });

# HTML::TableContentParser doesn't support entities.
#my $not_first;
#for(@{new HTML::TableContentParser->parse(join '', <>)}){
#	print $s if $not_first++;
#	for (@{$$_{rows}}) {
#		combine $tcx map $$_{data}, @{$$_{cells}};
#		print +string $tcx, "\n";
#	}
#}

# Feed the input (the files named in @ARGV, or STDIN) to the parser one
# line at a time.
my $doc = HTML::DOM->new;
while (my $chunk = <>) {
    $doc->write($chunk);
}
# Signal end of input so any HTML still buffered by the parser is processed.
$doc->close;

my $tables_printed = 0;
for my $table ($doc->getElementsByTagName('table')) {

    # The separator goes between tables, never before the first one.
    print $separator if $tables_printed++;

    for my $row ($table->rows) {
        my @fields = map { $_->as_text } $row->cells;
        $csv->combine(@fields)
            or die "Cannot encode row as CSV: " . $csv->error_input . "\n";
        print $csv->string, "\n";
    }
}

# That's it! Short, isn't it?