On Mon, 26 Jun 2006 18:17:47 GMT, Pet @
www.gymratz.co.uk ;¬) <PeTe33 (AT) gymratz (DOT) co.uk> wrote:
Quote:
Can anyone recommend a script that makes a google friendly site-map?
I have tried a few in the past, but they always fail dismally.
I'll even pay a few quid if the tool is right!
Here's mine. I gave up on the CPAN Perl module because it is too slow. It
does some things that are specific to my sites; you can figure them out and
adjust. My sitemaps are huge and easily hold a million entries.
#!/usr/bin/perl
use strict;
use DateTime::Precise;
use utf8;
#use XML::Simple;
use Algebra::UrlEncoding;
use WWW::Google::SiteMap;
# Rollover threshold: once more than this many URLs have been written to the
# current map file, the script closes it and starts the next numbered one.
# NOTE(review): presumably chosen to stay under the 50,000-URLs-per-file
# sitemap limit -- confirm.
my $maxcount = 49000;
# Usage text that is prepended to every command-line error message.
my $usage = "$0 --site (algebra|cooldictionary) mapfile {logfiles}";
# Build the file name of one numbered sitemap chunk.
#
# Arguments:
#   1. base name (path prefix shared by all chunks)
#   2. chunk number
#
# Returns "<base>.<number zero-padded to at least 4 digits>.gz".
sub mapfile_name {
    my $prefix = shift;
    my $seq    = shift;
    my $name   = sprintf "%s.%04d.gz", $prefix, $seq;
    return $name;
}
# Entity-escape a URL so it can be placed inside an XML element such as
# <loc>...</loc>.
#
# Argument: the raw URL string.
# Returns:  a copy with the five XML-reserved characters replaced by their
#           predefined entities (& < > " ').
#
# The previous version rewrote '&' to '&;', which is not a valid entity and
# produces a malformed document. All replacements happen in a single pass so
# the '&' introduced by one entity is never escaped a second time.
sub url2xml {
    my $url = shift;
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
        "'" => '&apos;',
    );
    $url =~ s/([&<>"'])/$entity_for{$1}/g;
    return $url;
}
# Site filter for algebra.com.
#
# Takes a request path taken from a log line and returns the absolute URL to
# list in the sitemap, or undef when the path should be left out. A path is
# accepted only when all three hold:
#   - it starts with /algebra/about/history/
#   - it contains no '?'
#   - it ends in "wikipedia"
sub algebra_filter {
    my $path = $_[0];
    #print "$path\n";
    if (   $path =~ /^\/algebra\/about\/history\//
        && $path !~ /\?/
        && $path =~ /wikipedia$/ )
    {
        return "http://www.algebra.com$path";
    }
    return undef;
}
# Site filter for cooldictionary.com.
#
# Takes a request path taken from a log line and returns the absolute URL to
# list in the sitemap when the path starts with /words; otherwise undef.
sub cooldictionary_filter {
    my $path = $_[0];
    return $path =~ /^\/words/
        ? "http://www.cooldictionary.com$path"
        : undef;
}
# Start a new gzip-compressed sitemap chunk on the global MAP handle.
#
# Arguments:
#   $base - base name of the sitemap files
#   $no   - chunk number; combined with $base by mapfile_name()
#
# Opens a pipe to "gzip -c" whose output is redirected to the chunk file and
# writes the XML declaration plus the opening <urlset> tag to it. The caller
# is responsible for writing the closing </urlset> and closing MAP.
#
# Dies if the gzip pipe cannot be started (the original ignored that failure
# and kept printing to a dead handle).
sub open_map {
    my ($base, $no) = @_;
    my $mfn = mapfile_name( $base, $no );
    print STDERR "Creating map '$mfn'.\n";

    # The redirection needs a shell, so single-quote the file name (escaping
    # any embedded quote) to keep spaces and metacharacters from being
    # interpreted by it.
    ( my $shell_mfn = $mfn ) =~ s/'/'\\''/g;
    open( MAP, '|-', "gzip -c >'$shell_mfn'" )
        or die "Cannot start gzip for map '$mfn': $!";

    print MAP qq{<?xml version='1.0' encoding='UTF-8'?>\n}
        . qq{<urlset xmlns="http://www.google.com/schemas/sitemap/0.84" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84 http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">\n};
}
# Command-line handling.
#
# Leading "--site NAME" pairs are consumed from @ARGV (if the option is
# repeated, the last one wins). NAME selects the filter sub used to decide
# which logged paths go into the sitemap. The next argument is the base name
# of the map files; whatever remains in @ARGV is the list of log files.
my $site        = undef;
my $site_filter = undef;
my $done;

# Supported --site values and the filter sub each one selects.
my %filter_for = (
    algebra        => \&algebra_filter,
    cooldictionary => \&cooldictionary_filter,
);

$done = 1;
while ($done) {
    $done = 0;
    next if $ARGV[0] ne '--site';    # no more options: loop ends
    shift @ARGV;
    $site = shift(@ARGV) || die $usage;
    $done = 1;
    $site_filter = $filter_for{$site}
        || die "$usage (unknown site $site)";
}

my $mapfile = shift @ARGV || die "$usage (no mapfile)";
die "$usage (no site)" unless $site;
# Main pass: read every log file named in @ARGV, pull the request path out of
# each "GET" line, run it through the selected site filter, and write the
# accepted URLs to numbered, gzip-compressed sitemap files via the MAP handle
# that open_map() opens.
#
# Changes from the original:
#   - log files are opened with 3-arg / list-form open, so no shell is
#     involved and odd characters in a file name cannot be misinterpreted;
#   - the no-op test `next unless '\"GET /'` (a constant, always-true string)
#     is gone; the substitution below already skips non-GET lines;
#   - a new map file is started only when there is another URL to put in it,
#     so a run that ends exactly on the rollover boundary no longer leaves a
#     trailing map that contains an empty <urlset>;
#   - closing the gzip output pipe is checked, so a failed or truncated map
#     is reported instead of going unnoticed.
my $bigcount = 0;
open_map( $mapfile, $bigcount );

# Every entry gets the same <lastmod>: the date 90 days before this run.
my $ago = DateTime::Precise->new;
$ago->inc_day( -90 );
# A full timestamp would look like 2005-05-07T00:45:10+00:00.
# NOTE(review): the format below presumably yields just YYYY-MM-DD -- confirm
# against the DateTime::Precise dprintf documentation.
my $lastmod = $ago->dprintf( "%^Y-%M-%D" );

my $used_locs = {};    # decoded request paths already seen (de-duplication)
my $count     = 0;     # URLs written to the current map file

foreach my $fn (@ARGV) {
    my $log;
    if ( $fn =~ /\.gz$/ ) {
        # List form: gunzip is run directly, without a shell.
        open( $log, '-|', 'gunzip', '-c', '--', $fn )
            or die "Cannot unzip $fn: $!";
    }
    else {
        open( $log, '<', $fn ) or die "Cannot open $fn: $!";
    }

    while ( my $path = <$log> ) {
        chomp $path;

        # Keep only what follows "GET " up to the next whitespace; lines
        # without a GET request are skipped.
        $path =~ s/^.*\"GET // or next;
        $path =~ s/\s+.*$//;

        $path = deurlstr( $path );
        next if $used_locs->{$path};
        $used_locs->{$path} = 1;

        my $url = $site_filter->( $path );
        next unless $url;

        # Drop URLs containing DEL/high-bit characters or characters that
        # would need URL or XML quoting.
        next if $url =~ /[\x7f-\xFF]/;
        next if $url =~ /[%<>']/;
        $url = url2xml( $url );

        # The current map is full: finish it and start the next one before
        # writing this URL. Each map still holds $maxcount + 1 entries, the
        # same as before.
        if ( $count > $maxcount ) {
            print MAP "</urlset>\n";
            close( MAP ) or die "Error finishing map $bigcount: $! (status $?)";
            $bigcount++;
            $count = 0;
            open_map( $mapfile, $bigcount );
        }

        print MAP "<url><loc>$url</loc><lastmod>$lastmod</lastmod></url>\n";
        $count++;
    }

    # A failing gunzip shows up here; report it but keep going, as the
    # original silently did.
    close( $log ) or warn "Problem closing $fn: $! (status $?)\n";
}

print MAP "</urlset>\n";
close( MAP ) or die "Error finishing map $bigcount: $! (status $?)";