#!/usr/bin/perl # # Feedextractor # resembles feed43.com, takes the same arguments # use strict; use warnings; use Data::Dumper; use Config::General qw(ParseConfig); use Digest::MD5 qw(md5 md5_hex md5_base64); use DB_File; use LWP::UserAgent; #use POSIX qw(strftime); no locale; use DateTime; my %db; my %conf; my $version = 0.1; my $only; sub ts { my $dt = DateTime->now('time_zone' => 'Europe/Berlin'); return $dt->strftime("%a, %d %b %Y %H:%M:%S %z"); } sub httpget { my $url = shift; my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (X11; U; Linux i586; en-US; rv:1.7.3) Gecko/20040924 Feedextrator/Version (Ubuntu)"); my $req = HTTP::Request->new(GET => $url); $req->header(Accept => "text/html, */*;q=0.1"); $req->header("Accept-Charset" => "utf-8"); $req->header("Accept-Language" => "de-DE"); my $res = $ua->request($req); if ($res->is_success) { return (1, $res->content); } else { return (0, $res->status_line); } } sub fetch { my $conf = shift; my $regex = $conf->{regex}; $regex =~ s/\*/.+?/g; $regex =~ s#/#\\/#g; $regex =~ s/\{(.+?)\}/(?<$1>.+?)/g; my $news = &httpget($conf->{url}); if(exists $conf->{clean}) { my $code = "\$news =~ $conf->{clean};"; eval $code; } my $xml = ""; if(exists $conf->{iteregex}) { my $iteregex = $conf->{iteregex}; $iteregex =~ s/\*/.+?/g; $iteregex =~ s#/#\\/#g; $iteregex =~ s/\{(.+?)\}/(?<$1>.+?)/g; while ($news =~ /$iteregex/gs) { my $item = $conf->{item}; my %itermatches = %+; if (exists $itermatches{LINK}) { my $entry = &httpget($itermatches{LINK}); if ($entry =~ /$regex/gs) { my %matches = %+; $matches{LINK} = $itermatches{LINK}; my $subst = "$matches{TITLE}$matches{LINK}"; my $guid = md5_base64($subst); my $ts = &ts(); if (exists $db{$guid} && !$only) { $ts = $db{$guid}; } else { $db{$guid} = $ts; } foreach my $key (keys %matches) { $matches{$key} =~ s/\s*$//; $matches{$key} =~ s/^\s*//; $item =~ s/\{$key\}/$matches{$key}/gse; } $item =~ s/\{TS\}/$ts/g; $item =~ s/\{MD5\}/$guid/g; $xml .= $item; } } } } else { while ($news =~ /$regex/gs) { my $item = $conf->{item}; my %matches = %+; my $subst = "$matches{TITLE}$matches{LINK}"; my $guid = md5_base64($subst); my $ts = &ts(); if (exists $db{$guid} && !$only) { $ts = $db{$guid}; } else { $db{$guid} = $ts; } foreach my $key (keys %matches) { $matches{$key} =~ s/\s*$//; $matches{$key} =~ s/^\s*//; $item =~ s/\{$key\}/$matches{$key}/gse; } $item =~ s/\{TS\}/$ts/g; $item =~ s/\{MD5\}/$guid/g; $xml .= $item; #print Dumper(\%+); } } return $xml; } sub get { my ($feed, $debug) = @_; if(! exists $conf{feed}->{$feed}) { print STDERR "Feed $feed is not configured!\n"; return; } my $xml = &fetch($conf{feed}->{$feed}); my $h = $conf{header}; my $f = $conf{footer}; my $ts = &ts(); $h =~ s/\{NAME\}/$conf{feed}->{$feed}->{name}/gs; $h =~ s/\{TS\}/$ts/gs; $h =~ s/\{URL\}/$conf{feed}->{$feed}->{url}/gs; if(! $debug) { open XML, ">$conf{xmldir}/$feed.xml" or die "Could not write feed to $conf{xmldir}/$feed.xml: $!\n"; select XML; binmode(XML, ":utf8"); } else { binmode(STDOUT, ":utf8"); } print "$h\n$xml\n$f\n"; if(! $debug) { close XML; } } %conf = ParseConfig(-ConfigFile => shift); tie %db, 'DB_File', $conf{md5db}, O_RDWR|O_CREAT, 0777, $DB_HASH or die $!; $only = shift; if($only) { &get($only, 1); } else { $only = 0; foreach my $feed (keys %{$conf{feed}}) { &get($feed); } } untie %db;