summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGavin Hurlbut <ghurlbut@mythtv.org>2011-12-14 09:13:09 (GMT)
committer Gavin Hurlbut <ghurlbut@mythtv.org>2011-12-14 09:17:31 (GMT)
commitb7c2e38fa5fc67c48423ac219d9c5a2908b8cc00 (patch)
tree1ad22e7f28d22a671178d1f561c09d0a4ce03171
parentd53a234ff309eda7ec0801ffb8767c9a30f9181f (diff)
Redo BBC Weather source
Seems the BBC had the urge to prettify (and break) their weather pages. The search now returns a different ID than the RSS feeds that we have been using. Luckily, the RSS ID number seems to still be buried in the HTML, and they are saying they'll have the RSS feeds linked "very soon". Meanwhile this should work. This adds another requirement for Perl modules: JSON. Fixes #10204
-rwxr-xr-xmythplugins/configure1
-rw-r--r--mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm270
-rwxr-xr-xmythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl61
-rwxr-xr-xmythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl59
4 files changed, 262 insertions, 129 deletions
diff --git a/mythplugins/configure b/mythplugins/configure
index b93e4af..0a0c145 100755
--- a/mythplugins/configure
+++ b/mythplugins/configure
@@ -460,6 +460,7 @@ if ! disabled weather; then
check_pl_lib "DateTime::Format::ISO8601" ||
disable_weather "DateTime::Format::ISO8601"
check_pl_lib "SOAP::Lite" || disable_weather "SOAP::Lite"
+ check_pl_lib "JSON" || disable_weather "JSON"
fi
if test "$dcraw" != "no" ; then
diff --git a/mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm b/mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm
index 95507d5..de6693c 100644
--- a/mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm
+++ b/mythplugins/mythweather/mythweather/scripts/uk_bbc/BBCLocation.pm
@@ -1,120 +1,212 @@
+#! /usr/bin/perl
+# vim:ts=4:sw=4:ai:et:si:sts=4
+
package BBCLocation;
use strict;
+use warnings;
require Exporter;
-use LWP::Simple;
-our @EXPORT = qw(Search);
-our $VERSION = 0.2;
+use utf8;
+use encoding 'utf8';
+use LWP::UserAgent;
+use JSON;
+use XML::XPath;
+use XML::XPath::XMLParser;
+use URI::Escape;
+
+
+our @EXPORT = qw(Search FindLoc);
+our $VERSION = 0.3;
my @searchresults;
my @resulturl;
my $resultcount = -1;
sub Search {
+ my ($search_string, $dir, $timeout, $logdir) = @_;
+ $search_string = uri_escape($search_string);
- my $base_url = 'http://news.bbc.co.uk/weather/util/search/Search.xhtml?';
- my $world_base_url = $base_url . 'lowgraphics=true&region=world&search=';
- my $local_base_url = $base_url . 'lowgraphics=true&region=uk&search=';
+ my $base_url = 'http://www.bbc.co.uk/locator/client/weather/en-GB/' .
+ 'search.json';
+ my $search_url = $base_url . '?ptrt=/&search=';
- my $search_string = shift;
- my $world_response = get $world_base_url . $search_string;
- my $local_response = get $local_base_url . $search_string;
+ my $file = $search_string;
+ getCachedJSON($search_url . $search_string, $dir, $file, $timeout, $logdir);
- &parseResults($world_response) if defined($world_response);
- &parseResults($local_response) if defined($local_response);
+ my $cachefile = "$dir/$file.json";
+ my $cachefile1 = "$dir/$file-results.html";
+ my $cachefile2 = "$dir/$file-pagination.html";
- if ( ($resultcount > 0 ) && ($#searchresults < 0) ) {
- foreach my $url (@resulturl) {
- my $url_response = get $base_url . $url;
- die unless defined $url_response;
- &parseResults($url_response);
- }
- }
+ open OF, "<:utf8", $cachefile or die "Can't read $cachefile: $!\n";
+ my $content = do { local $/; <OF> };
+ close OF;
- return @searchresults;
-}
+ my $decoded = decode_json $content;
+ $resultcount = $decoded->{"noOfResults"};
-sub parseResults {
- my $response = shift;
- my $isresults = 0;
- my $resultline = "";
-
- # Initialise a hash for the $locid & $locname results.
-
- # Use of a hash indexed by $locid ensures that more informative results
- # (e.g. "Sale, Australia" vs. "Sale") coming from <p class="response">
- # section will overwrite less informative results coming from
- # <span id="printbutton_Forecast"> section
-
my %loc_hash = ();
- foreach (split("\n", $response)) {
-
- # Declare a result if either the '<p class="response"> OR <span id="printbutton_Forecast"> strings are found
- # This ensures that single and multiple matches are caught
-
- if (/<p class=\"response\">/ || /<span id=\"printbutton_Forecast\">/) {
- $isresults = 1;
+ get_results($cachefile1, \%loc_hash);
+
+ if (exists $decoded->{"pagination"}) {
+ my %pages = ();
+ my $xp = XML::XPath->new(filename => $cachefile2);
+ my $nodeset = $xp->find("//ol/li/a");
+ foreach my $node ($nodeset->get_nodelist) {
+ my $url = $node->getAttribute("href");
+ my $num = $node->string_value;
+ $url =~ s/&amp;/&/;
+ $num =~ s/ //g;
+ $pages{$num} = $url;
}
- my $locname;
- my $locid;
- my $url;
-
- if ($isresults) {
- last if (/There are no forecasts matching/);
-
- $resultcount = $1 if (/<strong>There \w{2,3} (\d*) forecasts? matching/);
-
- # Collect result URLs
- if (/<a id=\"result_\d*\" .* href \=\"?.*search\=.*/) {
- $url = $_;
- $url =~ s/.*href \=\"(.*)\".*/$1/s;
- push (@resulturl, $url);
- }
-
- # Collect location IDs and location names
- elsif (/<a id=\"result_\d*\" .* href \=\"\/weather\/forecast\//) {
- $locid = $_;
- $locid =~ s/.*\/weather\/forecast\/(\d{0,5})\?.*/$1/s;
-
- $locname = $_;
- $locname =~ s/.*<a id=\"result_\d*\".*>(.*)<\/a>.*/$1/s;
-
- $resultline = $locid . "::" . $locname;
-
- $loc_hash{$locid} = $locname;
- }
-
- # Extract location ID and name from "Print <location>" link
-
- # This string is always present (provided valid search result - invalid results are caught above)
- # irresepective of whether single or multiple matches are returned
-
- elsif (/<span id=\"printbutton_Forecast\"><a title=\"Print (.+)\" href=\"\/weather\/forecast\/(\d{0,5})?/o)
- {
- $locid = $2;
- $locname = $1;
-
- $loc_hash{$locid} = $locname;
- }
+ foreach my $page (keys %pages)
+ {
+ getCachedJSON($base_url . $pages{$page}, $dir, $file . "-$page",
+ $timeout, $logdir);
+
+ my $cachefile3 = "$dir/$file-$page-results.html";
+ get_results($cachefile3, \%loc_hash);
}
}
-
- # Loop through contents of %loc_hash, check for existence within @searchresults, and add as necessary
-
+
+ my @searchresults = ();
foreach my $key (keys %loc_hash)
{
my $resultline = $key."::".$loc_hash{$key};
-
- if (! grep(/^$key/, @searchresults))
- {
- push (@searchresults, $resultline);
- }
+ push (@searchresults, $resultline);
}
-
return @searchresults;
}
+sub getCachedJSON {
+ my ($url, $dir, $file, $timeout, $logdir) = @_;
+
+ my $cachefile = "$dir/$file.json";
+ my $cachefile1 = "$dir/$file-results.html";
+ my $cachefile2 = "$dir/$file-pagination.html";
+
+ my $now = time();
+ my $decoded;
+
+ log_print( $logdir, "Loading URL: $url\n" );
+
+ if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
+ # File cache is still recent.
+ log_print( $logdir, "cached in $cachefile\n" );
+ } else {
+ log_print( $logdir, "$url\ncaching to $cachefile\n" );
+
+ my $ua = LWP::UserAgent->new;
+ $ua->timeout(30);
+ $ua->env_proxy;
+ $ua->default_header('Accept-Language' => "en");
+
+ my $response = $ua->get($url);
+ if ( !$response->is_success ) {
+ die $response->status_line;
+ }
+
+ open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
+ print OF $response->content;
+ close OF;
+
+ $decoded = decode_json $response->content;
+
+ open OF, ">:utf8", $cachefile1 or die "Can't open $cachefile1: $!\n";
+ print OF "<html>".$decoded->{"results"}."</html>";
+ close OF;
+
+ if (exists $decoded->{"pagination"}) {
+ open OF, ">:utf8", $cachefile2 or
+ die "Can't open $cachefile2: $!\n";
+ print OF "<html>".$decoded->{"pagination"}."</html>";
+ close OF;
+ } else {
+ unlink $cachefile2;
+ }
+ }
+}
+
+sub get_results {
+ my ($file, $outhash) = @_;
+
+ my $xp = XML::XPath->new(filename => $file);
+ my $nodeset = $xp->find("//ul/li/a");
+ foreach my $node ($nodeset->get_nodelist) {
+ my $url = $node->getAttribute("href");
+ my $loc = $node->string_value;
+
+ $url =~ s/^\/weather\///;
+ $outhash->{$url} = $loc;
+
+ print "$url"."::$loc\n";
+ }
+}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}
+
+sub FindLoc {
+ my ($locid, $dir, $timeout, $logdir) = @_;
+
+ my $url = "http://www.bbc.co.uk/weather/$locid";
+
+ my $file = "$locid.html";
+ getCachedHTML($url, $dir, $file, $timeout, $logdir);
+
+ my $cachefile = "$dir/$file";
+
+ open OF, "<:utf8", $cachefile;
+ my $contents = do { local $/; <OF>; };
+ close OF;
+
+ my ($rssid) = ($contents =~ /data-loc="(.*?)"/);
+ die "No RSS Location found for ID $locid!\n" unless defined $rssid;
+
+ $rssid =~ s/^LOC-//;
+ return $rssid;
+}
+
+
+sub getCachedHTML {
+ my ($url, $dir, $file, $timeout, $logdir) = @_;
+
+ my $cachefile = "$dir/$file";
+
+ my $now = time();
+
+ log_print( $logdir, "Loading URL: $url\n" );
+
+ if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
+ # File cache is still recent.
+ log_print( $logdir, "cached in $cachefile\n" );
+ } else {
+ log_print( $logdir, "$url\ncaching to $cachefile\n" );
+
+ my $ua = LWP::UserAgent->new;
+ $ua->timeout(30);
+ $ua->env_proxy;
+ $ua->default_header('Accept-Language' => "en");
+
+ my $response = $ua->get($url);
+ if ( !$response->is_success ) {
+ die $response->status_line;
+ }
+
+ my $content = $response->content;
+ open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
+ print OF $content;
+ close OF;
+ }
+}
+
+
1;
diff --git a/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl b/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl
index 45fd8c5..348a394 100755
--- a/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl
+++ b/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbccurrentxml.pl
@@ -1,13 +1,13 @@
#! /usr/bin/perl
-
-#
-# Based on nwsxml.pl by Lucien Dunning
-#
+# vim:ts=4:sw=4:ai:et:si:sts=4
use strict;
use warnings;
+use utf8;
+use encoding 'utf8';
use English;
+
use File::Basename;
use Cwd 'abs_path';
use lib dirname(abs_path($0 or $PROGRAM_NAME)),
@@ -16,47 +16,61 @@ use lib dirname(abs_path($0 or $PROGRAM_NAME)),
use XML::Simple;
use LWP::Simple;
-# Ideally we would use the If-Modified-Since header
-# to reduce server load, but they ignore it
-#use HTTP::Cache::Transparent;
use Getopt::Std;
+use File::Path;
+
use File::Basename;
use lib dirname($0);
use BBCLocation;
-our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d);
+our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d, $opt_D);
my $name = 'BBC-Current-XML';
-my $version = 0.2;
-my $author = 'Stuart Morgan';
-my $email = 'stuart@tase.co.uk';
+my $version = 0.3;
+my $author = 'Gavin Hurlbut / Stuart Morgan';
+my $email = 'gjhurlbu@gmail.com / stuart@tase.co.uk';
my $updateTimeout = 120*60;
# 2 Hours, BBC updates infrequently ~3 hours
-# Given that the option to update in the background now exists
-# potentially we could be hitting the server 12 times in a day
my $retrieveTimeout = 30;
my @types = ('cclocation', 'station_id', 'copyright',
'observation_time', 'weather', 'temp', 'relative_humidity',
'wind_dir', 'pressure', 'visibility', 'weather_icon',
'appt', 'wind_spdgst');
-my $dir = "./";
+my $dir = "/tmp/uk_bbc";
+my $logdir = "/tmp/uk_bbc";
+
+binmode(STDOUT, ":utf8");
+
+if (!-d $logdir) {
+ mkpath( $logdir, {mode => 0755} );
+}
getopts('Tvtlu:d:');
if (defined $opt_v) {
print "$name,$version,$author,$email\n";
+ log_print( $logdir, "-v\n" );
exit 0;
}
if (defined $opt_T) {
print "$updateTimeout,$retrieveTimeout\n";
+ log_print( $logdir, "-t\n" );
exit 0;
}
-if (defined $opt_l) {
+if (defined $opt_d) {
+ $dir = $opt_d;
+}
+if (!-d $dir) {
+ mkpath( $dir, {mode => 0755} );
+}
+
+if (defined $opt_l) {
my $search = shift;
- my @results = BBCLocation::Search($search);
+ log_print( $logdir, "-l $search\n" );
+ my @results = BBCLocation::Search($search, $dir, $updateTimeout, $logdir);
my $result;
foreach (@results) {
@@ -71,13 +85,9 @@ if (defined $opt_t) {
exit 0;
}
-if (defined $opt_d) {
- $dir = $opt_d;
-}
-
# we get here, we're doing an actual retrieval, everything must be defined
-my $locid = shift;
+my $locid = BBCLocation::FindLoc(shift, $dir, $updateTimeout, $logdir);
if (!(defined $opt_u && defined $locid && !$locid eq "")) {
die "Invalid usage";
}
@@ -248,3 +258,12 @@ foreach (@data) {
printf $datalabel . "::" . $datavalue . "\n";
}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}
diff --git a/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl b/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl
index 1998c00..a966cb3 100755
--- a/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl
+++ b/mythplugins/mythweather/mythweather/scripts/uk_bbc/bbcthreedayxml.pl
@@ -1,13 +1,13 @@
#! /usr/bin/perl
-
-#
-# Based on nwsxml.pl by Lucien Dunning
-#
+# vim:ts=4:sw=4:ai:et:si:sts=4
use strict;
use warnings;
+use utf8;
+use encoding 'utf8';
use English;
+
use File::Basename;
use Cwd 'abs_path';
use lib dirname(abs_path($0 or $PROGRAM_NAME)),
@@ -16,44 +16,60 @@ use lib dirname(abs_path($0 or $PROGRAM_NAME)),
use XML::Simple;
use LWP::Simple;
-# Ideally we would use the If-Modified-Since header
-# to reduce server load, but they ignore it
-#use HTTP::Cache::Transparent;
use Getopt::Std;
+use File::Path;
+
use File::Basename;
use lib dirname($0);
use BBCLocation;
-our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d);
+our ($opt_v, $opt_t, $opt_T, $opt_l, $opt_u, $opt_d, $opt_D);
my $name = 'BBC-3day-XML';
-my $version = 0.2;
-my $author = 'Stuart Morgan';
-my $email = 'stuart@tase.co.uk';
+my $version = 0.3;
+my $author = 'Gavin Hurlbut / Stuart Morgan';
+my $email = 'gjhurlbu@gmail.com / stuart@tase.co.uk';
my $updateTimeout = 360*60; # 6 Hours
my $retrieveTimeout = 30;
my @types = ('3dlocation', 'station_id', 'copyright', 'weather_icon',
'date-0', 'icon-0', 'low-0', 'high-0',
'date-1', 'icon-1', 'low-1', 'high-1',
'date-2', 'icon-2', 'low-2', 'high-2', 'updatetime');
-my $dir = "./";
+my $dir = "/tmp/uk_bbc";
+my $logdir = "/tmp/uk_bbc";
+
+binmode(STDOUT, ":utf8");
+
+if (!-d $logdir) {
+ mkpath( $logdir, {mode => 0755} );
+}
getopts('Tvtlu:d:');
if (defined $opt_v) {
print "$name,$version,$author,$email\n";
+ log_print( $logdir, "-v\n" );
exit 0;
}
if (defined $opt_T) {
print "$updateTimeout,$retrieveTimeout\n";
+ log_print( $logdir, "-t\n" );
exit 0;
}
-if (defined $opt_l) {
+if (defined $opt_d) {
+ $dir = $opt_d;
+}
+if (!-d $dir) {
+ mkpath( $dir, {mode => 0755} );
+}
+
+if (defined $opt_l) {
my $search = shift;
- my @results = BBCLocation::Search($search);
+ log_print( $logdir, "-l $search\n" );
+ my @results = BBCLocation::Search($search, $dir, $updateTimeout, $logdir);
my $result;
foreach (@results) {
@@ -68,13 +84,9 @@ if (defined $opt_t) {
exit 0;
}
-if (defined $opt_d) {
- $dir = $opt_d;
-}
-
# we get here, we're doing an actual retrieval, everything must be defined
-my $locid = shift;
+my $locid = BBCLocation::FindLoc(shift, $dir, $updateTimeout, $logdir);
if (!(defined $opt_u && defined $locid && !$locid eq "")) {
die "Invalid usage";
}
@@ -232,3 +244,12 @@ foreach $item (@{$xml->{channel}->{item}}) {
$i++;
}
+
+sub log_print {
+ return if not defined $::opt_D;
+ my $dir = shift;
+
+ open OF, ">>$dir/uk_bbc.log";
+ print OF @_;
+ close OF;
+}