Ticket #4518: kinox.pl

File kinox.pl, 11.8 KB (added by Denys Dmytriyenko <denis@…>, 16 years ago)

The actual script

Line 
1#!/usr/bin/perl -w
2
3#
4# This Perl script is intended to perform movie data lookups in Russian
5# based on the www.kinox.ru website
6#
7# For more information on MythVideo's external movie lookup mechanism, see
8# the README file in this directory.
9#
10# Author: Denys Dmytriyenko (denis AT denix DOT org)
11# Based on the allocine script by Xavier Hervy
12#
13
14# Note:
15#  Encoding on the Web page is cp1251
16#  Internal encoding of this script is koi8-r
17#  The output of this script is in utf8 (set by "outcp" below)
18
19use LWP::Simple;      # libwww-perl providing simple HTML get actions
20use HTML::Entities;
21use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8);
22
23no encoding;
24
25use Encode;
26use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P $opt_originaltitle $opt_casting $opt_u_dummy);
27use Getopt::Long;
28
# Program identity, reported by version()
our ($title, $version, $author) = ("KinoX Query", "v0.03", "Denys Dmytriyenko");

# This is the output encoding
our $outcp = "utf8";
35
36# binmode() does not work for some reason
37# The output ends up being in the wrong encoding
38#binmode(STDOUT, ":utf8");
39
# display usage
#
# Prints the command-line synopsis to STDOUT and terminates the script.
# Never returns: the script exits with status -1.
sub usage {
   print "usage: $0 -hviocMPD [parameters]\n";
   print "       -h, --help                       help\n";
   print "       -v, --version                    display version\n";
   print "       -i, --info                       display info\n";
   print "       -o, --originaltitle              concatenate title and original title\n";
   print "       -c, --casting                    with -D option, grab the complete actor list (much slower)\n";
   print "\n";
   print "       -M <query>,   --movie <query>    get movie list\n";
   print "       -D <movieid>, --data <movieid>   get movie data\n";
#   print "       -P <movieid>, --poster <movieid> get movie poster\n";
   exit(-1);
}
54
# Print one line naming the program, its version and its author, built from
# the script-level $title, $version and $author variables.
sub version {
   printf("%s (%s) by %s\n", $title, $version, $author);
}
59
# Print one line describing which kind of lookup this script performs.
sub info {
   my $summary = "Performs queries using the www.kinox.ru website.\n";
   print $summary;
}
64
# Detailed help: the version line, the info line, then the usage text.
# usage() ends the script, so this sub does not return.
sub help {
   foreach my $step (\&version, \&info, \&usage) {
      $step->();
   }
}
71
# returns text within 'data' between 'beg' and 'end' matching strings
#
# Parameters:
#   $data - text to search (typically a downloaded page)
#   $beg  - opening marker, matched case-insensitively
#   $end  - closing marker, matched case-insensitively, searched after $beg
# Returns the text between the two markers with every "&nbsp;" turned into a
# space, or "" when a marker is missing or an argument is undefined.
sub parseBetween {
   my ($data, $beg, $end)=@_; # grab parameters

   # A failed download hands us undef; treat any missing argument as "no match"
   return "" unless defined $data && defined $beg && defined $end;

   # The markers are located in a lowercased copy and the offsets are then
   # applied to the untouched text, so the result keeps its original case
   my $ldata = lc($data);

   my $found = index($ldata, lc($beg));
   return "" if $found == -1;

   my $start = $found + length($beg);
   my $finish = index($ldata, lc($end), $start);
   return "" if $finish == -1;

   my $result = substr($data, $start, $finish - $start);

   # decode_entities is deliberately not used here (it gave trouble with
   # special characters); only &nbsp; is translated, in any letter case
   $result =~ s/&nbsp;/ /gi;
   return $result;
}
89
# use to replace &nbsp; by " " (instead of decode_entities)
#
# Every "&nbsp;" entity, in any letter case, becomes a single space.
# Returns the modified copy; an undefined argument is returned unchanged.
sub removenbsp {
   my ($data)=@_; # grab parameters

   return $data unless defined $data;

   # One pass is enough: the replacement is a space, which can never
   # complete a new "&nbsp;" sequence
   $data =~ s/&nbsp;/ /gi;
   return $data;
}
103
104
# returns text within 'data' without tag
#
# Removes every "<...>" span, i.e. everything from a "<" up to and including
# the next ">" (newlines inside the tag are allowed).  A "<" that is never
# closed is left in place; the scan must not try to remove it, because there
# is no end offset to cut at.
# Returns the stripped text, or "" when the argument is undefined.
sub removeTag {
   my ($data)=@_; # grab parameters

   return "" unless defined $data;

   # [^>] also matches newlines, so multi-line tags are removed as well
   $data =~ s/<[^>]*>//g;
   return $data;
}
120
# Extract the text that follows a labelled heading on the details page.
#   $page  - downloaded page (windows-1251 bytes)
#   $label - heading markup as written in this file (koi8-r bytes)
#   $end   - marker that closes the section
# Returns the section with tags removed, still in windows-1251.
sub _kinoxLabelledText {
   my ($page, $label, $end) = @_;

   # the page is cp1251 while the literals in this script are koi8-r
   Encode::from_to($label, "koi8-r", "windows-1251");

   my $text = removeTag(parseBetween($page, $label, $end));
   # removes only the first run of 2+ whitespace characters (no /g);
   # presumably the indentation right after the label -- TODO confirm
   $text =~ s/\s{2,}//;
   return $text;
}

# Pull the next "$beg ... $end" delimited field out of $text, searching from
# $offset.  Returns a two-element list: the field with tags removed, and the
# offset of the closing marker (where the next search should resume).  When
# either marker is missing the field is "" and the offset is unchanged.
sub _kinoxNextField {
   my ($text, $beg, $end, $offset) = @_;

   my $start = index($text, $beg, $offset);
   return ("", $offset) if $start == -1;
   $start += length($beg);

   my $finish = index($text, $end, $start);
   return ("", $offset) if $finish == -1;

   return (removeTag(substr($text, $start, $finish - $start)), $finish);
}

# get Movie Data
#
# Downloads the kinox.ru details page for $movieid and prints one
# "Field:value" line per field to STDOUT, converted to the $outcp encoding.
# A page that cannot be fetched, or a field whose markers are missing,
# produces an empty value rather than an error.
sub getMovieData {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the movie details page
   my $request = "http://www.kinox.ru/index.asp?comm=4&num=" . $movieid;
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   # LWP::Simple::get returns undef on failure; carry on with an empty page
   # so every field is still printed (empty)
   $response = "" unless defined $response;

   # parse title and countries: the <h1> holds "title / original<br>countries"
   my $sub = parseBetween($response, "<h1>", "</h1>");
   my ($sub1, $countries) = split("<br>", $sub);
   $sub1 = "" unless defined $sub1;
   $countries = "" unless defined $countries;

   $countries = removeTag($countries);
   $countries =~ s/[\n\r]/ /g;
   Encode::from_to($countries, "windows-1251", $outcp);

   my ($title, $original_title) = split("<font size=4 color=#000000> / </font>", $sub1);
   $title = "" unless defined $title;
   $title = removeTag($title);
   if (!$original_title) { $original_title = "" ;}
   $original_title = removeTag($original_title);

   Encode::from_to($title, "windows-1251", $outcp);
   Encode::from_to($original_title, "windows-1251", $outcp);

   if (defined $opt_originaltitle){
      if ($original_title  ne  ""){
        $title = $title . " (" . $original_title . ")";
      }
   }

   # parse director
   my $director = _kinoxLabelledText($response, "<b>òÅÖÉÓÓÅÒ:</b>", "</a>");
   Encode::from_to($director, "windows-1251", $outcp);

   # parse plot
   my $plot = _kinoxLabelledText($response, "<b>ëÒÁÔËÏÅ ÓÏÄÅÒÖÁÎÉÅ:</b>", "</p>");
   Encode::from_to($plot, "windows-1251", $outcp);

   # parse cast
   my $cast = _kinoxLabelledText($response, "<b>÷ ÒÏÌÑÈ:</b>", "<b>");
   $cast =~ s/\s\(.*?\)//g;     # drop parenthesised remarks after a name
   $cast =~ s/\s*,\s*/,/g;      # plain comma-separated list
   $cast =~ s/\.$//;            # no trailing full stop
   Encode::from_to($cast, "windows-1251", $outcp);

   # studio, year, genres, runtime: four consecutive green <font> fields
   $sub = parseBetween($response, "<td colspan=2 bgcolor=f8f8f8 align=\"center\" valign=\"top\">", "</td></tr></table>");
   $sub =~ s/&nbsp;//g;
   $sub =~ s/&nbsp//g;
   Encode::from_to($sub, "windows-1251", $outcp);

   my $beg = "<font color=\"#008000\">";
   my $end = "</font>";
   my $pos = 0;
   my ($year, $genres, $runtime);

   # the first field is the studio; it is not part of the output, so it is
   # read only to step past it
   (undef, $pos)    = _kinoxNextField($sub, $beg, $end, $pos);
   ($year, $pos)    = _kinoxNextField($sub, $beg, $end, $pos);
   ($genres, $pos)  = _kinoxNextField($sub, $beg, $end, $pos);
   ($runtime, $pos) = _kinoxNextField($sub, $beg, $end, $pos);
   $genres =~ s|\s*/\s*|,|g;

   # output fields (these field names must match what MythVideo is looking for)
   print "Title:$title\n";
   if (!(defined $opt_originaltitle)){
    print "OriginalTitle:$original_title\n";
   }
   print "Year:$year\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "Runtime:$runtime\n";
   print "Cast:$cast\n";
   print "Genres:$genres\n";
   print "Countries:$countries\n";
}
237
# dump Movie list:  1 entry per line, each line as 'movieid:Movie Title'
#
# Parameters:
#   $filename - file name or free-text title to look up (may be URI-escaped)
#   $options  - not used for the search; only echoed in the debug output
# Prints one "movieid:Title" line per hit to STDOUT, with " (year)" appended
# to the title when a year cell is found.  Prints nothing when the page cannot
# be fetched or the site reports that nothing was found.
sub getMovieList {
   my ($filename, $options)=@_; # grab parameters

   # If we wanted to inspect the file for any reason we can do that now

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = $filename;

   $query = uri_unescape($query);  # in case it was escaped
   # Strip off the file extension
   if (rindex($query, '.') != -1) {
      $query = substr($query, 0, rindex($query, '.'));
   }
   # Strip off anything following '(' - people use this for general comments
   if (rindex($query, '(') != -1) {
      $query = substr($query, 0, rindex($query, '('));
   }
   # Strip off anything following '[' - people use this for general comments
   if (rindex($query, '[') != -1) {
      $query = substr($query, 0, rindex($query, '['));
   }
   # Strip off anything following '-' - people use this for general comments
   if (index($query, '-') != -1) {
      $query = substr($query, 0, index($query, '-'));
   }

   # IMDB searches do better if any trailing ,The is left off.
   # $1 is only consulted when the match itself succeeded.
   if ($query =~ /(.*), The$/i && $1) { $query = $1; }

   # the site expects cp1251; the query is taken to be koi8-r, the internal
   # encoding of this script
   Encode::from_to($query, "koi8-r", "windows-1251");

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = "" ;}
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results page; an empty or failed download is retried
   # once, and the number of requests is always bounded
   my $request = "http://www.kinox.ru/index.asp?comm=1&fop=false&pack=0&kw=$query";
   my $response = "";
   foreach my $attempt (1 .. 2) {
      if (defined $opt_d) { printf("# request: '%s'\n", $request); }
      $response = get($request);
      # LWP::Simple::get returns undef on failure
      $response = "" unless defined $response;
      if (defined $opt_d) { printf("# response: '%s'\n", $response); }
      last if $response ne "";
      if (defined $opt_d) { printf("# no results\n"); }
   }
   return if $response eq "";

   #
   # don't try to invent if it doesn't exist
   #
   my $notfnd = "ÎÉÞÅÇÏ ÎÅÂÙÌÏ ÎÁÊÄÅÎÏ";
   Encode::from_to($notfnd, "koi8-r", "windows-1251");
   return if index($response, $notfnd) != -1;

   # markers around a movie link and around the year cell that follows it
   my $beg = "<a class=l2 href=\"index.asp?comm=4&num=";
   my $end = "</a>";
   my $begy = "colspan=2 align=center>";
   my $endy = "</td>";

   my @movies;
   my $pos = 0;   # scan position; only ever moves forward

   while ((my $start = index($response, $beg, $pos)) != -1) {
      $start += length($beg);
      my $finish = index($response, $end, $start);
      last if $finish == -1;   # truncated link: nothing more to parse

      # the link text looks like: <movieid>">Movie name
      my ($movienum, $moviename) = split(/">/, substr($response, $start, $finish - $start));
      $movienum = "" unless defined $movienum;
      $moviename = "" unless defined $moviename;

      my $title = removeTag($moviename);
      $title =~ s/\s{2,}//;
      Encode::from_to($title, "windows-1251", $outcp);
      $pos = $finish;

      # the year cell is searched from the end of the link onwards; when it
      # is missing the title is listed without a year
      my $ystart = index($response, $begy, $pos);
      if ($ystart != -1) {
         $ystart += length($begy);
         my $yfinish = index($response, $endy, $ystart);
         if ($yfinish != -1) {
            my $movieyear = removeTag(substr($response, $ystart, $yfinish - $ystart));
            if ($movieyear) { $title = $title." (".$movieyear.")"; }
            $pos = $yfinish;
         }
      }

      push @movies, $movienum . ":" . $title;
   }

   # display array of values
   foreach my $movie (@movies) {
      print "$movie\n";
   }
   return;
}
355
#
# Main Program
#

# parse command line arguments
#
# Only long names are registered.  Getopt::Long's default configuration
# accepts unique abbreviations and ignores case, which is how the
# single-letter forms shown by usage() (-v, -i, -o, -c, -M, -D, -h) match.
# No "debug" option is registered on purpose: it would make -d / -D ambiguous
# with "Data".  Uncomment the $opt_d line below to get debug output.
GetOptions( "utf8" => \$opt_u_dummy,
            "help" => \$opt_h,
            "version" => \$opt_v,
            "info" => \$opt_i,
            "originaltitle" => \$opt_originaltitle,
            "casting" => \$opt_casting,
            "Data" => \$opt_D,
            "Movie" => \$opt_M,
            "Poster" => \$opt_P
            );

#$opt_d = 1;

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV<0) { help(); }

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift(@ARGV) || die "Usage : $0 -D <movieid>\n";
   getMovieData($movieid);
}

elsif (defined $opt_M) {
   # take query from cmdline arg: all remaining words, each followed by a space
   my $options = '';
   my $query = join(' ', @ARGV) . ' ';
   getMovieList($query, $options);
}