#!/usr/bin/perl
# Convert a saved customer-review HTML page into a simple XML document.
#
# The source HTML must already be converted to UTF-8.
# usage: ./html2xml.pl < hoge.html > hoge.xml
#
# Output shape:
#   <evaluation>
#     <item>
#       <total>N</total>    number of people who voted on the review
#       <agreed>N</agreed>  number who found the review helpful
#       <star>N</star>      star rating (integer part)
#       <title>...</title>
#       <text> ... </text>
#     </item>
#     ...
#   </evaluation>
#
# NOTE: `use utf8` is deliberately NOT enabled. Input, output and the
# Japanese literals in the patterns below are all handled as raw UTF-8
# bytes, so no encoding layers are needed on STDIN/STDOUT.

use strict;
use warnings;

# Per-item parser state. Everything here is reset by close_item().
my $in_item    = 0;    # an <item> element is currently open
my $have_votes = 0;    # <total>/<agreed> were already written for this item
my $want_title = 0;    # star rating seen; the next <b>...</b> is the title
my $want_body  = 0;    # title seen; waiting for the "reviewed product" line
my $in_text    = 0;    # a <text> element is currently open
my $skip       = 0;    # number of upcoming input lines to drop

# Close whatever is open for the current item and reset the per-item state.
# Does nothing when no item is open, so it is safe to call at every point
# where an item might end (end marker, next boundary, end of input).
sub close_item {
    return unless $in_item;
    print "  </text>\n" if $in_text;
    print "</item>\n";
    ($in_item, $have_votes, $want_title, $want_body, $in_text, $skip)
        = (0) x 6;
    return;
}

print "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n<evaluation>\n";

while (my $line = <>) {
    # Each review starts at a BOUNDARY comment. Close a previous item that
    # never reached its end marker so the output stays well-formed.
    if ($line =~ /<!-- BOUNDARY -->/) {
        close_item();
        print "<item>\n";
        $in_item = 1;
        next;
    }

    # The voting-button block comes after the review; nothing below it
    # belongs to the current item.
    if ($line =~ /crVotingButtons/) {
        close_item();
        next;
    }

    next unless $in_item;
    if ($skip > 0) { $skip--; next; }

    # "N 人中、M人の方が ..." = "M of N people found this helpful".
    # Require at least one digit and accept any number of comma groups.
    if ($line =~ /([0-9][0-9,]*) 人中、([0-9][0-9,]*)人の方が/) {
        my ($total, $agreed) = ($1, $2);
        s/,//g for $total, $agreed;
        print "  <total>$total</total>\n";
        print "  <agreed>$agreed</agreed>\n";
        $have_votes = 1;
        next;
    }

    # "5つ星のうち N" = "N out of 5 stars". Reviews nobody voted on have no
    # vote line, so emit zero counts to keep every item the same shape.
    if ($line =~ /5つ星のうち ([0-9]*)/) {
        my $stars = $1;
        print "  <total>0</total>\n  <agreed>0</agreed>\n" unless $have_votes;
        print "  <star>$stars</star>\n";
        $want_title = 1;
        next;
    }

    if ($want_title && $line =~ m{<b>(.*)</b>}) {
        my $title = $1;
        # Same '&' handling as the body text below; a raw '&' would make
        # the XML malformed.
        $title =~ s/&/＆/g;
        print "  <title>$title</title>\n";
        $want_title = 0;
        $want_body  = 1;
        next;
    }

    # "レビュー対象商品:" = "Product reviewed:". The review body starts two
    # lines after this line.
    if ($want_body && !$in_text && $line =~ /レビュー対象商品:/) {
        $skip    = 2;
        $in_text = 1;
        print "  <text>\n";
        next;
    }

    # A blank line while inside the body ends the review.
    if ($in_text && $line eq "\n") {
        close_item();
        next;
    }

    # This styled <div> follows the review body and marks its end. It is
    # ignored unless a <text> is actually open, so no stray closing tags
    # are produced.
    if ($line =~ /padding-top: 10px; clear: both; width:/) {
        close_item() if $in_text;
        next;
    }

    next unless $in_text;

    # Body line: drop <br />, turn <p> into a space, and replace '&' with
    # the full-width '＆' so no raw ampersand reaches the XML.
    $line =~ s{<br />}{}g;
    $line =~ s/<p>/ /g;
    $line =~ s/&/＆/g;
    print $line;
}

# Input may end in the middle of an item.
close_item();

print "</evaluation>\n";
