Ticket #9074: allocine.pl

File allocine.pl, 16.8 KB (added by Alexandra Lepercq <alexandra@…>, 10 years ago)

allocine script

Line 
#!/usr/bin/perl -w

#
# This perl script is intended to perform movie data lookups in French based on
# the www.allocine.fr website
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Original author: Xavier Hervy (maxpower44 AT tiscali DOT fr)

# changes:
#   20-10-2009: Geoffroy Geerseau ( http://www.soslinux.net : jamdess AT soslinux DOT net )
#   Modified for the new allocine templates
#   25-10-2009: Geoffroy Geerseau ( http://www.soslinux.net : jamdess AT soslinux DOT net )
#   Poster download correction
#   Userrating correction
#   28-10-2009: Robert McNamara (Myth Dev)
#   Fix issues in above patches-- files should never be downloaded to /tmp.
#   Convert script to output in new grabber output format for .23.  Leave backwards compat.
#   02-11-2009: Geoffroy Geerseau
#   Allocine have, once again, changed their templates...
#   06-08-2010: Alexandra Lepercq
#   Allocine have, once again, changed their templates...
#   Add some data from api.allocine.fr (thanks to http://wiki.gromez.fr/dev/api/allocine)
#       http://api.allocine.fr/xml/movie?code=$movieid&partner=3
27
28use File::Basename;
29use File::Copy;
30use lib dirname($0);
31use Encode;
32use utf8;
33use Encode 'from_to';
34use MythTV::MythVideoCommon;
35
36use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_l $opt_M $opt_P $opt_originaltitle $opt_casting $opt_u_dummy);
37use Getopt::Long;
38
# Identification strings for this grabber.  version() prints all three; the
# title and version are also appended to MythVideoCommon's URL_get_extras list.
($title, $version, $author) = ("Allocine Query", "v2.06", "Xavier Hervy");
push @MythTV::MythVideoCommon::URL_get_extras, $title, $version;

# Everything written to STDOUT goes through the :utf8 output layer.
binmode STDOUT, ":utf8";
45
# display usage
#
# Prints the command-line synopsis to STDOUT and then terminates the script
# with exit(-1); this sub never returns to its caller.
#
# Takes no arguments.
sub usage {
   print "usage: $0 -hviocMPD [parameters]\n",
         "       -h, --help                       help\n",
         "       -v, --version                    display version\n",
         "       -i, --info                       display info\n",
         "       -o, --originaltitle              concatenate title and original title\n",
         "       -c, --casting                    with -D option, grab the complete actor list (much slower)\n",
         "\n",
         "       -M <query>,   --movie <query>    get movie list\n",
         "       -D <movieid>, --data <movieid>   get movie data\n",
         "       -P <movieid>, --poster <movieid> get movie poster\n";
   exit(-1);
}
60
# Print one line identifying the program: title, version (in parentheses)
# and author, taken from the file-level $title, $version and $author.
sub version {
   my $banner = "$title ($version) by $author\n";
   print($banner);
}
65
# Print one line describing the kind of lookup this grabber performs.
sub info {
   my $summary = "Performs queries using the www.allocine.fr website.\n";
   print($summary);
}
70
# Full help: the version line, the info line, then the usage text.
# usage() ends the process, so control does not come back from here.
sub help {
   foreach my $step (\&version, \&info, \&usage) {
      $step->();
   }
}
77
# returns text within 'data' without tag
#
# Parameters:
#   $data - text that may contain "<...>" markup
# Returns:
#   a copy of $data in which every span from a "<" up to and including the
#   next ">" has been deleted.  undef is handed back unchanged.
#
# A "<" that is never closed by a ">" is left in place.  A scan built on
# index(...)+1 cannot detect that case (the sum is never -1) and would keep
# re-splicing the string forever; a single regex pass always terminates.
sub removeTag {
   my ($data) = @_; # grab parameters

   return $data unless defined $data;

   # [^>]* also crosses newlines, so tags split over several lines are removed.
   $data =~ s/<[^>]*>//g;
   return $data;
}
93
94
# Return the leading decimal number found in $text, or 0 when there is none.
# Used so that an empty or missing API field does not raise "isn't numeric"
# warnings under -w.
sub _apiNumber {
   my ($text) = @_;
   return 0 unless defined $text;
   return ($text =~ /^\s*([+-]?(?:\d+(?:\.\d*)?|\.\d+))/) ? $1 + 0 : 0;
}

# Rewrite an API list such as
#    <genre code="13001">Action</genre>
# into one output element per line:
#    <category name="Action"/>
#
# Parameters:
#   $xml    - inner text of the API list element
#   $apiTag - element name used by the API ("genre", "nationality")
#   $outTag - element name to emit ("category", "country")
sub _apiListToElements {
   my ($xml, $apiTag, $outTag) = @_;
   $xml =~ s/\Q$apiTag\E code/$outTag name/g;
   $xml =~ s!</\Q$apiTag\E>!"/>\n!g;
   $xml =~ s![0-9]!!g;   # empties the numeric code (and removes any digit in the label)
   $xml =~ s!">!!g;      # splices the emptied attribute onto the label text
   return $xml;
}

# get Movie Data
#
# Looks up one film and prints its metadata to STDOUT as a "<metadata>" XML
# document (the field names must match what MythVideo is looking for).
#
# Parameters:
#   $movieid - film code used in the api.allocine.fr / www.allocine.fr URLs
#
# Title, year, synopsis, user rating, runtime, cast, genres and countries are
# cut out of the api.allocine.fr XML record; the poster address is scraped
# from the film's "affiches" pages on www.allocine.fr.  The film page itself
# is not downloaded because nothing scraped from it is part of the output
# (no <movierating> element is emitted).
#
# Honours $opt_d (debug trace lines starting with "#") and
# $opt_originaltitle (append the original title to the title).
sub getMovieData {
   my ($movieid) = @_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # get Movie MetaData from api.allocine
   # myth_url_get hands back ($rc, $response), so take the body explicitly
   # instead of relying on what a scalar-context call happens to yield.
   my $requestAPI = "http://api.allocine.fr/xml/movie?code=$movieid&partner=3";
   my (undef, $responseAPI) = myth_url_get($requestAPI);
   $responseAPI = "" unless defined $responseAPI;
   from_to($responseAPI, 'utf-8', 'iso-8859-1');

   my $allocineurl = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";

   # parse Title and Year
   my $titleApi         = parseBetween($responseAPI, "<title>", "</title>");
   my $originaltitleApi = parseBetween($responseAPI, "<originalTitle>", "</originalTitle>");
   my $yearApi          = parseBetween($responseAPI, "<productionYear>", "</productionYear>");
   if (defined $opt_originaltitle
       && $originaltitleApi ne ""
       && $originaltitleApi ne $titleApi) {
      $titleApi = $titleApi . " (" . $originaltitleApi . ")";
   }

   # parse Plot
   my $plotApi = parseBetween($responseAPI, "<synopsis>", "</synopsis>");

   # parse User Rating (API value scaled by 2.5 and truncated)
   my $userratingApi = int(_apiNumber(parseBetween($responseAPI, "<userRating>", "</userRating>")) * 2.5);

   # parse Movie length
   # The API value is divided by 60 (presumably seconds -> minutes; confirm
   # against the API) and truncated so that <runtime> is a whole number.
   my $runtimeApi = int(_apiNumber(parseBetween($responseAPI, "<runtime>", "</runtime>")) / 60);

   # parse Cast
   # The substitutions below are order-dependent: they turn each
   #    <castMember><person code="N">Name</person><activity code="N">Job</activity>...</castMember>
   # into a single  <person name="Name" job="Job" .../>  element by first
   # renaming the pieces, then deleting all digits and every leftover '">'.
   my $castApi = parseBetween($responseAPI, "<casting>", "</casting>");
   $castApi =~ s!<castMember>!\n!g;
   $castApi =~ s!</castMember>!/>!g;
   $castApi =~ s/person code/person name/g;
   $castApi =~ s!</person>!"!g;
   $castApi =~ s/<activity code/ job/g;
   $castApi =~ s!</activity>!"!g;
   $castApi =~ s/<role>/ character="/g;
   $castApi =~ s!</role>!"!g;
   $castApi =~ s![0-9]!!g;
   $castApi =~ s!">!!g;
   $castApi =~ s/<picture href/ picture/g;
   $castApi =~ s!</picture>!"!g;
   # Picture URLs lose their digits above; drop the resulting dead links.
   $castApi =~ s!picture="http://images.allocine.fr/medias/nmedia/////.jpg"!!g;
   $castApi =~ s!Réalisateur!director!g;
   $castApi =~ s!Acteur!actor!g;
   $castApi =~ s!Producteur!producer!g;
   $castApi =~ s!Compositeur!composer!g;

   #Genres
   my $genreApi = _apiListToElements(
      parseBetween($responseAPI, "<genreList>", "</genreList>"),
      "genre", "category");

   #Countries
   my $countryApi = _apiListToElements(
      parseBetween($responseAPI, "<nationalityList>", "</nationalityList>"),
      "nationality", "country");

   # parse for Coverart
   # 1) poster listing page -> id of the first poster
   # 2) that poster's detail page -> "Agrandir" (enlarge) link
   # 3) otherwise the thumbnail on the listing page (already downloaded)
   my $posterBase = "http://www.allocine.fr/film/fichefilm-" . $movieid . "/affiches/";
   my (undef, $listing) = myth_url_get($posterBase);
   my $mediafile = parseBetween($listing, "<a href=\"/film/fichefilm-" . $movieid . "/affiches/detail/?cmediafile=", "\" >");
   my (undef, $detail) = myth_url_get($posterBase . "detail/?cmediafile=" . $mediafile);
   my $uri = trim(parseBetween($detail, "<a Target=\"_blank\" Class=\"fs11\" href=\"", "\">Agrandir</a>"));
   if ($uri eq "")
   {
      my $tmp_uri = parseBetween($listing, "<a href=\"/film/fichefilm-" . $movieid . "/affiches/\">", " alt=");
      $tmp_uri =~ s/\n/ /gm;
      # The start marker swallows the leading "h" of "http", so put it back.
      $uri = trim(parseBetween($tmp_uri, "<img src='h", "'"));
      if ($uri ne "")
      {
         $uri = "h$uri";
      }
   }
   # if no picture was found, just download the empty poster
   if ($uri eq "") {
      $uri = "http://images.allocine.fr/r_160_214/commons/emptymedia/AffichetteAllocine.gif";
   }

   # MetaData output
   print "<?xml version='1.0' encoding='UTF-8'?>\n";
   print "<metadata>\n";
   print "<item>\n";
   print "<inetref>$movieid</inetref>\n";
   print "<title>$titleApi</title>\n";
   print "<language>fr</language>\n";
   print "<description>$plotApi</description>\n";
   print "<countries>\n";
   print "$countryApi";
   print "</countries>\n";
   print "<categories>\n";
   print "$genreApi";
   print "</categories>\n";
   print "<userrating>$userratingApi</userrating>\n";
   print "<year>$yearApi</year>\n";
   print "<runtime>$runtimeApi</runtime>\n";
   print "<homepage>$allocineurl</homepage>\n";
   print "<people>";          # no newline: each cast entry starts with one
   print "$castApi";
   print "</people>\n";
   print "<images>\n";
   print "<image type=\"coverart\" url=\"$uri\"/>\n";
   print "</images>\n";
   print "</item>\n";
   print "</metadata>\n";
}
349
# dump Movie Poster
#
# Prints exactly one line to STDOUT: the address of a poster image for the
# given film.
#
# Parameters:
#   $movieid - film code used in the www.allocine.fr "affiches" URLs
#
# Lookup order:
#   1) the film's poster listing page gives the id of the first poster;
#   2) that poster's detail page is searched for the full-size image, first
#      with the <div class="tac"> layout and then with the "Agrandir"
#      (enlarge) link that getMovieData also looks for;
#   3) failing that, the thumbnail on the listing page is used;
#   4) failing that, the site's empty-poster placeholder is printed.
#
# Honours $opt_d (debug trace lines starting with "#").
sub getMoviePoster {
   my ($movieid) = @_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # get the poster listing page
   my $request = "http://www.allocine.fr/film/fichefilm-" . $movieid . "/affiches/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my (undef, $listing) = myth_url_get($request);
   my $mediafile = parseBetween($listing, "<a href=\"/film/fichefilm-" . $movieid . "/affiches/detail/?cmediafile=", "\" >");

   # get the detail page of the first poster
   my (undef, $detail) = myth_url_get($request . "detail/?cmediafile=" . $mediafile);
   my $uri = parseBetween(parseBetween($detail, "<div class=\"tac\" style=\"\">", "</div>"), "<img src=\"", "\" alt");
   if ($uri eq "")
   {
      $uri = trim(parseBetween($detail, "<a Target=\"_blank\" Class=\"fs11\" href=\"", "\">Agrandir</a>"));
   }
   if ($uri eq "")
   {
      # Fall back to the thumbnail; the listing page is kept from the first
      # download, so it does not have to be requested a second time.
      my $tmp_uri = parseBetween($listing, "<a href=\"/film/fichefilm-" . $movieid . "/affiches/\">", " alt=");
      $tmp_uri =~ s/\n/ /gm;
      # The start marker swallows the leading "h" of "http", so put it back.
      $uri = trim(parseBetween($tmp_uri, "<img src='h", "'"));
      if ($uri ne "")
      {
         $uri = "h$uri";
      }
      # Nothing is printed here: the single print at the end of the sub is
      # the only output, so the caller never sees a duplicate or blank line.
   }

   # if no picture was found, just download the empty poster
   if ($uri eq "") {
      $uri = "http://images.allocine.fr/r_160_214/commons/emptymedia/AffichetteAllocine.gif";
   }

   print "$uri\n";
}
386
# get Movie List
#
# Runs a title search on www.allocine.fr and prints one "movieid:title" line
# per result to STDOUT.
#
# Parameters:
#   $filename - raw query text; it is passed through cleanTitleQuery() first
#   $options  - optional, only echoed in the debug trace
#
# Honours $opt_d (debug trace lines starting with "#").
sub getMovieList {
   my ($filename, $options) = @_; # grab parameters

   my $query = cleanTitleQuery($filename);
   if (!$options) { $options = ""; }
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results  page
   my $request = "http://www.allocine.fr/recherche/1/?q=$query";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my ($rc, $response) = myth_url_get($request);
   from_to($response, 'utf-8', 'iso-8859-1');
   $response =~ s/\n//g;

   # every result sits between these two markers
   my $beg = "<div style=\"margin-top:-5px;\">";
   my $end = "<span class=\"fs11\">";

   my @movies;

   my $data = $response;
   if ($data eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   my $start = index($data, $beg);
   while ($start != -1) {
      $start += length($beg);
      my $finish = index($data, $end, $start);

      # A start marker without a matching end marker means the page was
      # truncated or the template changed.  Stop here: with $finish == -1
      # the remaining text would never get shorter and the loop would not end.
      last if $finish == -1;

      my $sub1 = substr($data, $start, $finish - $start);
      # cut everything from the last "(" onwards
      $sub1 =~ s/(.*)\(.*$/$1/;
      my $moviename = trim(removeTag($sub1));
      my $movienum  = parseBetween($sub1, "<a href='/film/fichefilm_gen_cfilm=", ".html");

      # re-append a "(year)" that is still present in the stripped name
      my ($movieyear) = $moviename =~ /\((\d+)\)/;
      if ($movieyear) {
         $moviename = $moviename . " (" . $movieyear . ")";
      }

      # add to array
      push(@movies, "$movienum:$moviename");

      # advance data to next movie
      $data  = substr($data, $finish);
      $start = index($data, $beg);
   }

   # display array of values
   for my $movie (@movies) {
      print "$movie\n";
   }
}
452
#
# Main Program
#
# Parses the command line, handles the informational switches, then runs
# exactly one of the three grabber modes: -D (metadata), -P (poster) or
# -M (title search).

# parse command line arguments
#
# Only long names are declared; the single-letter forms shown by usage()
# work through Getopt::Long's abbreviation and case-insensitive matching.
# "language" takes an optional value (":s") so that the value is consumed
# wherever -l appears on the command line, and a bare -l is still accepted.
GetOptions( "utf8"          => \$opt_u_dummy,
            "help"          => \$opt_h,
            "version"       => \$opt_v,
            "info"          => \$opt_i,
            "language:s"    => \$opt_l,
            "originaltitle" => \$opt_originaltitle,
            "casting"       => \$opt_casting,
            "Data"          => \$opt_D,
            "Movie"         => \$opt_M,
            "Poster"        => \$opt_P
          );

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV < 0) { help(); }

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift(@ARGV) || die "Usage : $0 -D <movieid>\n";
   getMovieData($movieid);
}
elsif (defined $opt_P) {
   # take movieid from cmdline arg
   my $movieid = shift(@ARGV) || die "Usage : $0 -P <movieid>\n";
   getMoviePoster($movieid);
}
elsif (defined $opt_M) {
   # take query from cmdline args: all remaining words, each followed by a space
   my $query   = join(' ', @ARGV) . ' ';
   my $options = '';
   getMovieList($query, $options);
}
503# vim: set expandtab ts=3 sw=3 :