second pass at new getxcatdocs - convert tables to pandoc, fix image downloads, and more

mellor 2014-09-09 09:30:14 -04:00
parent 40b5891c4a
commit 9dd06c1047


@@ -15,14 +15,16 @@ use strict;
use Getopt::Long;
use Cwd;
use JSON;
use List::Util qw[max];
# URL for the xCAT Allura wiki API markdown on SourceForge
my $sf_url='http://sourceforge.net/rest';
my $wiki_url=$sf_url.'/p/xcat/wiki/';
my $SF_URL='http://sourceforge.net/rest';
my $WIKI_URL=$SF_URL.'/p/xcat/wiki/';
# Update this list if you group any xcat docs on a separate page such that they
# are no longer linked from the main doc page:
my @indexdocs = ('XCAT_Documentation',
my @INDEXDOCS = ('XCAT_Documentation',
'Power_775_Cluster_Documentation',
'Highly_Available_Management_Node',
'Mixed_Cluster_Support',
@@ -33,16 +35,23 @@ my $HELP;
my $UPLOAD;
my $UPLOADONLY;
my $IGNOREERRORS;
my $CONTINUE;
my $SINGLE_DOC;
my $VERBOSE;
my $MDDIR;
my $HTMLDIR;
my $PDFDIR;
my $IMAGEDIR;
my %LOADEDDOCS;
sub verbose { if ($VERBOSE) { print shift, "\n"; } }
my $usage = sub {
my $exitcode = shift @_;
print "Usage: getxcatdocs [-?|-h|--help] \n";
print "Usage: getxcatdocs [-v|--verbose] [-u|--upload] [--uploadonly] [-i|--ignoreerrors] [<destination-dir>]\n";
print "Usage: getxcatdocs [-v|--verbose] [-d|--doc single_doc] [-i|--ignoreerrors] [<destination-dir>]\n";
print "Usage: getxcatdocs [-v|--verbose] [-c|--continue] [-d|--doc single_doc] [-i|--ignoreerrors] [<destination-dir>]\n";
exit $exitcode;
};
@@ -58,6 +67,7 @@ my $usage = sub {
'v|verbose' => \$VERBOSE,
'u|upload' => \$UPLOAD,
'uploadonly' => \$UPLOADONLY,
'c|continue' => \$CONTINUE,
'i|ignoreerrors' => \$IGNOREERRORS,
'd|doc=s' => \$SINGLE_DOC ))
{ $usage->(1); }
@@ -66,35 +76,28 @@ my $usage = sub {
if ($^O =~ /^aix/i) { die "Error: this command is not yet supported on AIX.\n"; }
my $destdir = scalar(@ARGV) ? $ARGV[0] : '.';
chdir($destdir) or die "Can not cd to $destdir: $!\n";
my $DESTDIR = scalar(@ARGV) ? $ARGV[0] : '.';
chdir($DESTDIR) or die "Can not cd to $DESTDIR: $!\n";
my $json = JSON->new();
if ($SINGLE_DOC) {
my $scurlcmd = "curl -X GET $wiki_url$SINGLE_DOC";
verbose($scurlcmd);
my $pagecontent = `$scurlcmd`;
if ($? && !$IGNOREERRORS) { die "error encountered in $scurlcmd \n";}
my $pageout = $json->decode($pagecontent);
foreach my $pageatt (@{$pageout->{attachments}}) {
my $swgetcmd = "wget $pageatt->{url}";
verbose($swgetcmd);
system($swgetcmd);
if ($? && !$IGNOREERRORS) { die "error encountered in $swgetcmd \n";}
}
convert_doc($SINGLE_DOC,$pageout->{text},'.','.','.','.');
$MDDIR = '.';
$HTMLDIR = '.';
$PDFDIR = '.';
$IMAGEDIR = '.';
download_doc($SINGLE_DOC);
convert_doc($SINGLE_DOC);
exit;
}
# Download the HTML docs and convert them all to pdfs
if (!$UPLOADONLY) { gethtmldocs('md','html','pdf','images'); }
if (!$UPLOADONLY) { gethtmldocs(); }
# tar/compress
my $date=`date +%Y%m%d%H%M`;
chop $date;
my $docname="xcat-docs-snap$date.tar.gz";
chdir($destdir) or die "Can not cd to $destdir: $!\n";
my $cmd = "tar -zcf $docname html pdf images 2>&1";
verbose($cmd);
@@ -114,82 +117,119 @@ my $usage = sub {
sub gethtmldocs {
my $mddir = shift;
my $htmldir = shift;
my $pdfdir = shift;
my $imagedir = shift;
my $savedir = getcwd();
mkdir($mddir);
mkdir($htmldir);
mkdir($pdfdir);
mkdir($imagedir);
#delete all the files in the dirs in case they previously ran this
unlink <$mddir/*>;
unlink <$htmldir/*>;
unlink <$pdfdir/*>;
unlink <$imagedir/*>;
print "\nDownloading and converting the xCAT wiki document list from $wiki_url ...\n";
my @doclist;
my %donelist;
foreach my $index (@indexdocs) {
if ( $donelist{$index} ) { next; }
my $indexcmd = "curl -X GET $wiki_url/$index";
verbose($indexcmd);
my $indexmd = `$indexcmd`;
if ($? && !$IGNOREERRORS) { die "error encountered in $indexcmd \n";}
my $jsout = $json->decode($indexmd);
push @doclist,@{$jsout->{related_artifacts}};
foreach my $att (@{$jsout->{attachments}}) {
my $iwgetcmd = "wget -P $imagedir/ $att->{url}";
verbose($iwgetcmd);
system($iwgetcmd);
if ($? && !$IGNOREERRORS) { die "error encountered in $iwgetcmd \n";}
}
convert_doc($index,$jsout->{text},$mddir,$htmldir,$pdfdir,$imagedir);
$donelist{$index}=1;
}
print "\nDownloading and converting the xCAT wiki documentation to $savedir ...\n";
$MDDIR = 'md';
$HTMLDIR = 'html';
$PDFDIR = 'pdf';
$IMAGEDIR = 'images';
foreach my $doc (@doclist) {
my $doc_name = $doc;
$doc_name =~ s/\/.*\/(.+)\/$/$1/;
if ( $donelist{$doc_name} ) { next; }
verbose("processing $doc");
my $doc_url=$sf_url.$doc;
my $curlcmd = "curl -X GET $doc_url";
verbose($curlcmd);
my $pagecontent = `$curlcmd`;
my $pageout = $json->decode($pagecontent);
foreach my $pageatt (@{$pageout->{attachments}}) {
my $wgetcmd = "wget -P $imagedir/ $pageatt->{url}";
system($wgetcmd);
if ($? && !$IGNOREERRORS) { die "error encountered in $wgetcmd \n";}
}
convert_doc($doc_name,$pageout->{text},$mddir,$htmldir,$pdfdir,$imagedir);
$donelist{$doc_name}=1;
mkdir($MDDIR);
mkdir($HTMLDIR);
mkdir($PDFDIR);
mkdir($IMAGEDIR);
# unless continuing from a previous run, delete any files left in these dirs
if ($CONTINUE) {
print "CONTINUING with files already in $MDDIR\n";
my @mdfiles = glob "$MDDIR/*.md";
foreach my $mdf (@mdfiles) {
$mdf =~ s/^$MDDIR\///;
$mdf =~ s/\.md//;
$LOADEDDOCS{$mdf}=1;
}
} else {
unlink <$MDDIR/*>;
unlink <$HTMLDIR/*>;
unlink <$PDFDIR/*>;
unlink <$IMAGEDIR/*>;
}
chdir($savedir);
print "\nDownloading and converting the xCAT wiki document list from $WIKI_URL ...\n";
foreach my $index (@INDEXDOCS) {
my @related_docs = download_doc($index);
foreach my $docref (@related_docs) {
my $docref_name = $docref;
$docref_name =~ s/\/.*\/(.+)\/$/$1/;
download_doc($docref_name);
}
}
foreach my $doc (keys %LOADEDDOCS) {
convert_doc($doc);
}
return;
}
sub download_doc {
my $doc_name = shift;
if ( $LOADEDDOCS{$doc_name} ) { return; }
verbose("processing $doc_name");
$LOADEDDOCS{$doc_name}=1;
my $curlcmd = "curl --retry 5 -X GET $WIKI_URL/$doc_name";
verbose($curlcmd);
my $docjson = `$curlcmd`;
if ($? && !$IGNOREERRORS) { die "error encountered in $curlcmd \n";}
my $jsout = $json->decode($docjson);
foreach my $att (@{$jsout->{attachments}}) {
my $wgetcmd = "wget -P $IMAGEDIR/ $att->{url}";
verbose($wgetcmd);
system($wgetcmd);
if ($? && !$IGNOREERRORS) { die "error encountered in $wgetcmd \n";}
}
open(MDFILE, ">$MDDIR/${doc_name}.md") or die "Could not open >$MDDIR/${doc_name}.md";
print MDFILE $jsout->{text};
close MDFILE;
return @{$jsout->{related_artifacts}};
}
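# For reference, a minimal standalone sketch (not part of this commit) of the
# Allura wiki REST response shape that download_doc() relies on.  Only the
# three fields the script actually reads (text, attachments, related_artifacts)
# are shown; the sample values and anything else about the payload are
# illustrative assumptions.
use strict;
use JSON;
my $sample_response = <<'EOJ';
{
  "text": "# Some xCAT doc\n[[include ref=Some_Shared_Section]]",
  "attachments": [ { "url": "http://sourceforge.net/p/xcat/wiki/Some_Doc/attachment/diagram.png" } ],
  "related_artifacts": [ "/p/xcat/wiki/Power_775_Cluster_Documentation/" ]
}
EOJ
my $page = JSON->new()->decode($sample_response);
print "markdown bytes: ", length($page->{text}), "\n";
print "attachment url: $_->{url}\n" for @{ $page->{attachments} };
print "related doc:    $_\n"        for @{ $page->{related_artifacts} };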
sub convert_doc {
my $doc_name = shift;
my $doc_text = shift;
my $mddir = shift;
my $htmldir = shift;
my $pdfdir = shift;
my $imagedir = shift;
open(MDFILE, "<$MDDIR/${doc_name}.md") or die "Could not open <$MDDIR/${doc_name}.md";
my @doc_lines = <MDFILE>;
close MDFILE;
my $doc_text = join('',@doc_lines);
$doc_text = process_includes($doc_text,0);
if ($doc_text =~ /begin_xcat_table/) {
open(MDFILE, ">$MDDIR/${doc_name}.md") or die "Could not open >$MDDIR/${doc_name}.md";
print MDFILE $doc_text;
close MDFILE;
convert_tables($doc_name);
open(MDFILE, "<$MDDIR/${doc_name}.md") or die "Could not open <$MDDIR/${doc_name}.md";
@doc_lines = <MDFILE>;
close MDFILE;
$doc_text = join('',@doc_lines);
}
## Make image refs local
$doc_text =~ s/\!\[\]\(.+\/(.+)\.png\)/\!\[\]\(\.\.\/$imagedir\/$1\.png\)/g;
$doc_text =~ s/\!\[\]\(.+\/(.+)\.PNG\)/\!\[\]\(\.\.\/$imagedir\/$1\.PNG\)/g;
$doc_text =~ s/\!\[\]\(.+\/(.+)\.jpg\)/\!\[\]\(\.\.\/$imagedir\/$1\.jpg\)/g;
open(MDFILE, ">$mddir/${doc_name}.md") or die;
$doc_text =~ s/\!\[\]\(.+\/(.+)\.png\)/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.png\)/g;
$doc_text =~ s/\!\[\]\(.+\/(.+)\.PNG\)/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.PNG\)/g;
$doc_text =~ s/\!\[\]\(.+\/(.+)\.jpg\)/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.jpg\)/g;
$doc_text =~ s/\[img src=(.+)\.png\]/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.png\)/g;
$doc_text =~ s/\[img src=(.+)\.PNG\]/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.PNG\)/g;
$doc_text =~ s/\[img src=(.+)\.jpg\]/\!\[\]\(\.\.\/$IMAGEDIR\/$1\.jpg\)/g;
## Remove [TOC] entries
$doc_text =~ s/\[TOC\]//g;
open(MDFILE, ">$MDDIR/${doc_name}.md") or die "Could not open >$MDDIR/${doc_name}.md";
print MDFILE $doc_text;
close MDFILE;
my $pandoccmd = "pandoc -s --toc $mddir/${doc_name}.md -o $htmldir/${doc_name}.html";
my $pandoccmd = "pandoc -s --toc $MDDIR/${doc_name}.md -o $HTMLDIR/${doc_name}.html";
verbose($pandoccmd);
system($pandoccmd);
if ($? && !$IGNOREERRORS) { die "error encountered in $pandoccmd \n";}
@@ -197,18 +237,160 @@ sub convert_doc {
# figure out what was going on:
# pandoc does different processing if target filetype is html
# but all internal refs only work in browser when there is no html filetype
rename "$htmldir/${doc_name}.html","$htmldir/${doc_name}";
rename "$HTMLDIR/${doc_name}.html","$HTMLDIR/${doc_name}";
$doc_text =~ s/\!\[\]\(\.\.\/$imagedir\/(.+)\.png\)/\!\[\]\(\.\/$imagedir\/$1\.png\)/g;
$doc_text =~ s/\!\[\]\(\.\.\/$imagedir\/(.+)\.PNG\)/\!\[\]\(\.\/$imagedir\/$1\.PNG\)/g;
$doc_text =~ s/\!\[\]\(\.\.\/$imagedir\/(.+)\.jpg\)/\!\[\]\(\.\/$imagedir\/$1\.jpg\)/g;
open(MDFILE, ">$mddir/${doc_name}.md") or die;
$doc_text =~ s/\!\[\]\(\.\.\/$IMAGEDIR\/(.+)\.png\)/\!\[\]\(\.\/$IMAGEDIR\/$1\.png\)/g;
$doc_text =~ s/\!\[\]\(\.\.\/$IMAGEDIR\/(.+)\.PNG\)/\!\[\]\(\.\/$IMAGEDIR\/$1\.PNG\)/g;
$doc_text =~ s/\!\[\]\(\.\.\/$IMAGEDIR\/(.+)\.jpg\)/\!\[\]\(\.\/$IMAGEDIR\/$1\.jpg\)/g;
open(MDFILE, ">$MDDIR/${doc_name}.md") or die "Could not open >$MDDIR/${doc_name}.md";
print MDFILE $doc_text;
close MDFILE;
my $pandoccmd2 = "pandoc --toc $mddir/${doc_name}.md -o $pdfdir/${doc_name}.pdf";
my $pandoccmd2 = "pandoc --toc $MDDIR/${doc_name}.md -o $PDFDIR/${doc_name}.pdf";
verbose($pandoccmd2);
system($pandoccmd2);
if ($? && !$IGNOREERRORS) { die "error encountered in $pandoccmd2 \n";}
}
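# A small standalone illustration (hypothetical file names, 'images' hard-coded
# in place of $IMAGEDIR) of the image rewriting done in convert_doc(): markdown
# image URLs and Allura [img src=...] tags are both redirected to the locally
# downloaded copies.
my $sample = "![](http://sourceforge.net/p/xcat/wiki/Some_Doc/attachment/arch.png)\n"
           . "[img src=cabling.jpg]\n";
$sample =~ s/\!\[\]\(.+\/(.+)\.png\)/\!\[\]\(\.\.\/images\/$1\.png\)/g;
$sample =~ s/\[img src=(.+)\.jpg\]/\!\[\]\(\.\.\/images\/$1\.jpg\)/g;
print $sample;    # prints:  ![](../images/arch.png)
                  #          ![](../images/cabling.jpg)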
sub process_includes {
my $doc_text = shift;
my $include_nest = shift;
if ($include_nest++ > 10) { die "include nesting deeper than 10 levels; possible infinite recursion"; }
while (1) {
if ($doc_text =~ /\[\[(\s*)include (\s*)ref=(\s*)(.+)(\s*)\]\]/) {
my $next_include = $4;
download_doc($next_include);
open(INCLDFILE, "<$MDDIR/${next_include}.md") or die "Could not open <$MDDIR/${next_include}.md";
my @include_lines = <INCLDFILE>;
close INCLDFILE;
# my $include_text = join('\n', @include_lines);
my $include_text = join('', @include_lines);
$include_text = process_includes($include_text,$include_nest);
$doc_text =~ s/\[\[(\s*)include (\s*)ref=(\s*)$next_include(\s*)\]\]/$include_text/g;
} else {
last;
}
}
return $doc_text;
}
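# Example (hypothetical page name) of the wiki include markup that
# process_includes() expands: a line such as
#
#   [[include ref=Cluster_Name_Resolution]]
#
# is replaced in place by the full markdown text of the Cluster_Name_Resolution
# page, which is downloaded via download_doc() first if it is not already in
# $MDDIR; includes inside the included page are expanded the same way, up to
# 10 levels deep.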
sub convert_tables {
my $doc_name=shift;
my $infile="$MDDIR/${doc_name}.md";
my $outfile=$infile;
open(MDFILE, "<$infile") or die "Could not open <$infile";
my @inlines=<MDFILE>;
close MDFILE;
my @outlines;
my @tablines;
my $in_comment=0;
my $xcat_table=0;
my $numcols=1;
my @colwidths=(0);
my $tabcount=0;
verbose("converting tables in $doc_name");
foreach my $line (@inlines) {
if ($line =~ /\<\!---/) { $in_comment=1; next; }
if ($in_comment) {
if ($line =~ /begin_xcat_table/) {$xcat_table=1; next;}
if ($xcat_table) {
if ($line =~ /numcols=(\d+)/) { $numcols=$1; next;}
if ($line =~ /colwidths=([\d,]+)/) { @colwidths=split(',',$1); next;}
}
if ($line =~ /end_xcat_table/) {
my $separator = '+';
foreach my $c (@colwidths) {
if ($c > 0) { $separator .= '-' x $c; }
$separator .= '+';
}
$separator .= "\n";
my $headsep = $separator;
$headsep =~ s/-/=/g;
my $rowline = $separator;
$rowline =~ s/-/ /g;
my $nosep=0;
foreach my $tabline(@tablines) {
if ($tabline =~ /^\s*$/) { next;}
if ($tabline =~ /^\-\-/) {
push (@outlines,$headsep);
$nosep = 1;
next;
}
if ($nosep) { $nosep=0;} else {push (@outlines,$separator);}
$tabline =~ s/^\s*\|//;
my @vals = split (/\|/,$tabline);
my $last_cell_line=0;
my $colnum=0;
my @tabrow;
foreach my $c (@colwidths) {
if ($c > 0) {
my $colval=$vals[$colnum];
$colval =~ s/(\s*)$//;
my $vallen = length($colval);
my $cell_line=0;
while ($vallen > $c) {
$tabrow[$cell_line++][$colnum] = substr($colval,0,$c);
$vallen -= $c;
$colval = substr($colval,$c,$vallen);
}
$tabrow[$cell_line][$colnum] = substr($colval,0,$vallen);
if ($vallen < $c) {
$tabrow[$cell_line][$colnum] .= " " x ($c-$vallen);
}
$last_cell_line = max($cell_line,$last_cell_line);
}
$colnum++;
}
my @rowlines;
for (my $i=0;$i<=$last_cell_line;$i++) {
for (my $j=0;$j<=$numcols-1;$j++) {
$rowlines[$i] .= "|";
if ($tabrow[$i][$j]) { $rowlines[$i] .= $tabrow[$i][$j]; }
else { $rowlines[$i] .= " " x $colwidths[$j]; }
}
$rowlines[$i] .= "|\n";
}
push (@outlines,@rowlines);
}
push (@outlines,$separator);
# reset to process next table
@tablines = ();
$xcat_table=0; $numcols=1;@colwidths=(0);next;
}
if ($line =~ /--\>/) {$in_comment=0;next;}
next;
}
if ($xcat_table) { push (@tablines,$line); next; }
push (@outlines,$line);
next;
}
open(MD2FILE, ">$outfile") or die "Could not open >$outfile";
print MD2FILE @outlines;
close MD2FILE;
return;
}
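# Worked example (hypothetical content) of the xcat_table markup that
# convert_tables() rewrites into a pandoc grid table.  Input markdown:
#
#   <!---
#   begin_xcat_table
#   numcols=2
#   colwidths=10,20
#   -->
#   |Node|Description of node|
#   --
#   |node01|first compute node|
#   <!---
#   end_xcat_table
#   -->
#
# Output, with each cell padded (and wrapped if longer than its column width):
#
#   +----------+--------------------+
#   |Node      |Description of node |
#   +==========+====================+
#   |node01    |first compute node  |
#   +----------+--------------------+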