1 | #!/usr/bin/perl |
---|
2 | # |
---|
3 | # This script parses the HTML of an Environment Canada weather forecast |
---|
4 | # page as returned from http://www.weatheroffice.gc.ca/forecast/city_e.html. |
---|
5 | # |
---|
6 | # TODO Not exactly sure how Environment Canada reports windchill. Looks |
---|
7 | # like they don't report it in the summer time. Using temperature |
---|
8 | # as a value for now. |
---|
9 | # |
---|
10 | # TODO Not sure how wind gust speed is reported. Gust is always blank |
---|
11 | # |
---|
12 | # TODO Environment Canada only reports 5 day forecasts. 6 day forecast |
---|
13 | # layout is used to report 5 day information. |
---|
14 | # |
---|
15 | # TODO Code is pretty messy right now, by no means an elegant solution |
---|
16 | # |
---|
17 | # This requires the HTML::Parser module. |
---|
18 | |
---|
19 | package ENVCANParser; |
---|
20 | use strict; |
---|
21 | use POSIX; |
---|
22 | use HTML::Parser (); |
---|
23 | |
---|
24 | our $VERSION = 0.1; |
---|
25 | |
---|
26 | my %results; |
---|
27 | my %inside; |
---|
28 | my $scratch; |
---|
29 | my %directions = ( N => "North", S => "South", E => "East", W => "West", |
---|
30 | NE => "Northeast", NW => "Northwest", SE => "Southeast", SW => "Southwest"); |
---|
31 | |
---|
32 | sub start_h { |
---|
33 | my ($tag, %attr) = @_; |
---|
34 | $inside{$tag} = 1; |
---|
35 | |
---|
36 | # Copy attributes |
---|
37 | $inside{"$tag-id"} = $attr{id} if defined $attr{id}; |
---|
38 | $inside{"$tag-class"} = $attr{class} if defined $attr{class}; |
---|
39 | $inside{"$tag-src"} = $attr{src} if defined $attr{src}; |
---|
40 | $inside{"$tag-longdesc"} = $attr{longdesc} if defined $attr{longdesc}; |
---|
41 | |
---|
42 | doIMG() if $inside{img}; |
---|
43 | } |
---|
44 | |
---|
45 | # End tag, clear hash |
---|
46 | sub end_h { |
---|
47 | my ($tag) = @_; |
---|
48 | undef $inside{$tag}; |
---|
49 | undef $inside{"$tag-id"}; |
---|
50 | undef $inside{"$tag-class"}; |
---|
51 | } |
---|
52 | |
---|
53 | sub text { |
---|
54 | |
---|
55 | if (defined $inside{title}) { |
---|
56 | $_[0] =~ s/\n//sg; |
---|
57 | $_[0] =~ /(.*),\s*(.*)- 5 Day Weather/; |
---|
58 | $results{'cclocation'} = "$1, $2"; |
---|
59 | $results{'3dlocation'} = "$1, $2"; |
---|
60 | $results{'6dlocation'} = "$1, $2"; |
---|
61 | } |
---|
62 | |
---|
63 | if ($inside{h3}) { |
---|
64 | if ($inside{'div-id'} eq "f1") { $results{'date-0'} = $_[0]; } |
---|
65 | if ($inside{'div-id'} eq "f2") { $results{'date-1'} = $_[0]; } |
---|
66 | if ($inside{'div-id'} eq "f3") { $results{'date-2'} = $_[0]; } |
---|
67 | if ($inside{'div-id'} eq "f4") { $results{'date-3'} = $_[0]; } |
---|
68 | if ($inside{'div-id'} eq "f5") { $results{'date-4'} = $_[0]; } |
---|
69 | if ($inside{'div-id'} eq "f6") { $results{'date-5'} = $_[0]; } |
---|
70 | } |
---|
71 | |
---|
72 | if ($inside{li}) { |
---|
73 | if ($inside{'li-class'} eq "low") { |
---|
74 | if ($inside{'div-id'} eq "f1") { $_[0] =~ /\w* (\d*)/; $results{'low-0'} = $1; } |
---|
75 | if ($inside{'div-id'} eq "f2") { $_[0] =~ /\w* (\d*)/; $results{'low-1'} = $1; } |
---|
76 | if ($inside{'div-id'} eq "f3") { $_[0] =~ /\w* (\d*)/; $results{'low-2'} = $1; } |
---|
77 | if ($inside{'div-id'} eq "f4") { $_[0] =~ /\w* (\d*)/; $results{'low-3'} = $1; } |
---|
78 | if ($inside{'div-id'} eq "f5") { $_[0] =~ /\w* (\d*)/; $results{'low-4'} = $1; } |
---|
79 | if ($inside{'div-id'} eq "f6") { $_[0] =~ /\w* (\d*)/; $results{'low-5'} = $1; } |
---|
80 | } |
---|
81 | |
---|
82 | if ($inside{'li-class'} eq "high") { |
---|
83 | if ($inside{'div-id'} eq "f1") { $_[0] =~ /\w* (\d*)/; $results{'high-0'} = $1; } |
---|
84 | if ($inside{'div-id'} eq "f2") { $_[0] =~ /\w* (\d*)/; $results{'high-1'} = $1; } |
---|
85 | if ($inside{'div-id'} eq "f3") { $_[0] =~ /\w* (\d*)/; $results{'high-2'} = $1; } |
---|
86 | if ($inside{'div-id'} eq "f4") { $_[0] =~ /\w* (\d*)/; $results{'high-3'} = $1; } |
---|
87 | if ($inside{'div-id'} eq "f5") { $_[0] =~ /\w* (\d*)/; $results{'high-4'} = $1; } |
---|
88 | if ($inside{'div-id'} eq "f6") { $_[0] =~ /\w* (\d*)/; $results{'high-5'} = $1; } |
---|
89 | } |
---|
90 | } |
---|
91 | |
---|
92 | if ($inside{div}) { |
---|
93 | if ($inside{'div-class'} eq "citycondition") { $results{'weather'} = $_[0]; } |
---|
94 | |
---|
95 | if ($inside{'div-id'} eq "cityobserved") { |
---|
96 | $_[0] =~ /.* (\d*\:\d*.*)/; |
---|
97 | $results{'observation_time'} = "Last updated at $1"; |
---|
98 | $results{'updatetime'} = "Last updated at $1"; |
---|
99 | $results{'observation_time_rfc822'} = rfc822($1); |
---|
100 | } |
---|
101 | } |
---|
102 | |
---|
103 | if ($inside{dt}) { |
---|
104 | if ($_[0] =~ /(Temperature)/) { $scratch = 1; return; } |
---|
105 | if ($_[0] =~ /(Pressure)\/ Tendency/) { $scratch = 2; return; } |
---|
106 | if ($_[0] =~ /(Visibility)/) { $scratch = 3; return; } |
---|
107 | if ($_[0] =~ /(Humidity)/) { $scratch = 4; return; } |
---|
108 | if ($_[0] =~ /(Dewpoint)/) { $scratch = 5; return; } |
---|
109 | if ($_[0] =~ /(Wind)/) { $scratch = 6; return; } |
---|
110 | } |
---|
111 | |
---|
112 | if ($inside{dd}) { |
---|
113 | if ($scratch == 1) { $_[0] =~ /(\d*).*/; $results{'temp'} = $1; $results{'windchill'} = $1; $results{'appt'} = $1; } |
---|
114 | if ($scratch == 2) { $_[0] =~ /(\d*\.\d+) kPa.*/; $results{'pressure'} = $1 * 10; } |
---|
115 | if ($scratch == 3) { $_[0] =~ /(\d*) km/; $results{'visibility'} = $1; } |
---|
116 | if ($scratch == 4) { $_[0] =~ /(\d*) \%/; $results{'relative_humidity'} = $1; } |
---|
117 | if ($scratch == 5) { $_[0] =~ /(\d*).*/; $results{'dewpoint'} = $1; } |
---|
118 | if ($scratch == 6) { |
---|
119 | $_[0] =~ /.?(\w+) (\d+) km\/h/; |
---|
120 | $results{'wind_dir'} = $directions{$1}; |
---|
121 | $results{'wind_speed'} = $2; |
---|
122 | $results{'wind_spdgst'} = $2; |
---|
123 | } |
---|
124 | |
---|
125 | $scratch = 0; |
---|
126 | } |
---|
127 | |
---|
128 | } |
---|
129 | |
---|
130 | sub doIMG { |
---|
131 | my $icon; |
---|
132 | |
---|
133 | # Get Icon |
---|
134 | if ($inside{'img-src'} =~ /\/weathericons\/(\d*\.gif)/) { |
---|
135 | $icon = $1; |
---|
136 | open(FH, "ENVCAN_icons") or die "Cannot open icons"; |
---|
137 | while (my $line = <FH>) { |
---|
138 | chomp $line; |
---|
139 | if ($line =~ /$icon\:\:(.*)/) { |
---|
140 | $icon = $1; |
---|
141 | last; |
---|
142 | } |
---|
143 | } |
---|
144 | close (FH); |
---|
145 | } |
---|
146 | |
---|
147 | # Current conditions |
---|
148 | $results{"weather_icon"} = $icon if ($inside{'img-class'} eq "currentimg"); |
---|
149 | $results{"icon-0"} = $icon if ($inside{'img-longdesc'} eq "#f1"); |
---|
150 | $results{"icon-1"} = $icon if ($inside{'img-longdesc'} eq "#f2"); |
---|
151 | $results{"icon-2"} = $icon if ($inside{'img-longdesc'} eq "#f3"); |
---|
152 | $results{"icon-3"} = $icon if ($inside{'img-longdesc'} eq "#f4"); |
---|
153 | $results{"icon-4"} = $icon if ($inside{'img-longdesc'} eq "#f5"); |
---|
154 | $results{"icon-5"} = $icon if ($inside{'img-longdesc'} eq "#f6"); |
---|
155 | |
---|
156 | undef ($inside{'img-class'}); |
---|
157 | undef ($inside{'img-src'}); |
---|
158 | undef ($inside{'img-longdesc'}); |
---|
159 | undef ($inside{'img'}); |
---|
160 | } |
---|
161 | |
---|
162 | sub rfc822 { |
---|
163 | my ($string) = @_; |
---|
164 | |
---|
165 | if ($string =~ /(\d*):(\d*) (AM|PM) (...) \w* (\d*) (\w*) (\d*)/) { |
---|
166 | my $hour = int($1) - 1; |
---|
167 | my $min = int($2); |
---|
168 | my $ampm = $3; |
---|
169 | my $tzone = $4; |
---|
170 | my $day = $5; |
---|
171 | my $month = $6; |
---|
172 | my $year = $7; |
---|
173 | |
---|
174 | if ($ampm eq "PM") { if (int($hour) < 11) { $hour += 12; } } |
---|
175 | $month = 0 if $month eq "January"; |
---|
176 | $month = 1 if $month eq "February"; |
---|
177 | $month = 2 if $month eq "March"; |
---|
178 | $month = 3 if $month eq "April"; |
---|
179 | $month = 4 if $month eq "May"; |
---|
180 | $month = 5 if $month eq "June"; |
---|
181 | $month = 6 if $month eq "July"; |
---|
182 | $month = 7 if $month eq "August"; |
---|
183 | $month = 8 if $month eq "September"; |
---|
184 | $month = 9 if $month eq "October"; |
---|
185 | $month = 10 if $month eq "November"; |
---|
186 | $month = 11 if $month eq "December"; |
---|
187 | $year = int($year) - 1900; |
---|
188 | |
---|
189 | my $time_t = POSIX::mktime(0, $min, $hour, $day, $month, $year); |
---|
190 | my $now_string = localtime($time_t); |
---|
191 | |
---|
192 | return $now_string; |
---|
193 | } |
---|
194 | |
---|
195 | return ""; |
---|
196 | |
---|
197 | } |
---|
198 | |
---|
199 | sub doParse { |
---|
200 | |
---|
201 | my ($data, @types) = @_; |
---|
202 | |
---|
203 | # Initialize results hash |
---|
204 | foreach my $type (@types) { $results{$type} = ""; } |
---|
205 | |
---|
206 | my $p = HTML::Parser->new(api_version => 3); |
---|
207 | $p->unbroken_text(1); |
---|
208 | $p->report_tags(qw(div dd dt h2 h3 img li p title)); |
---|
209 | $p->ignore_elements(qw(style script)); |
---|
210 | $p->handler( start => \&start_h, 'tagname, @attr'); |
---|
211 | $p->handler( end => \&end_h, "tagname"); |
---|
212 | $p->handler( text => \&text, "dtext"); |
---|
213 | $p->parse($data) || die $!; |
---|
214 | |
---|
215 | return %results; |
---|
216 | } |
---|
217 | |
---|
218 | 1 |
---|