Stories
Slash Boxes
Comments
NOTE: use Perl; is on undef hiatus. You can read content, but you can't post it. More info will be forthcoming forthcomingly.

All the Perl that's Practical to Extract and Report

The Fine Print: The following comments are owned by whoever posted them. We are not responsible for them in any way.
 Full
 Abbreviated
 Hidden
More | Login | Reply
Loading... please wait.
  • write a program that deduced an XML Schema from a large bunch of XML documents in under thirty minutes :)

    Could you extract the central ideas of that and possibly upload it as a module. I am forced to use XML-Schema at work, and would love to have more perl tools for it.
    Yes I know that it is a mess. To quote Andy Wardley:
    My conclusion, after spending nearly 6 months working on it, is that it is an utter bastard of a specification written by a committee of Java XML product vendors. 200+ pages of twisted

    • It's not at all CPAN worthy. It currently only deals with a single namespace, and has some stuff hardwired for SVG (I needed the schema that is just enough to describe a bunch of SVG documents, not the complete one). Adding multinamespace support shouldn't be too hard, though the loops in the data structures will get a bit twisted. Here is the code, raw. It may help you in some ways, you never know, all help is good when it comes to XML Schema ;) If you don't understand the data structure which I know is hairy, turn DEBUG on and it'll dump it. Dumped, it becomes a lot clearer. This is a quick and dirty hack, all disclaimers apply.

      Andy's view on XML Schema is right, I know, I've taken part in an implementation... Thank $DEITY for <ecode>.

      # simple schema generator

      use strict;
      use XML::SAX::ParserFactory;
      use Data::Dumper;

      use constant DEBUG => 0;

      # conf
      my @overrideFiles = qw();

      # list files
      my $dir = '.';
      opendir DIR, $dir or die "Can't open $dir";
      my @files = grep /\.xml$/, readdir DIR;
      closedir DIR;

      # process that
      my %info;
      my $p = XML::SAX::ParserFactory->parser( Handler => SchemaGenerator->new(\%info) );
      @files = @overrideFiles ? @overrideFiles : @files;
      for my $f (@files) {
          eval {
              $p->parse_uri($f);
          };
          print "Error: '$@' in document '$f'\n" if $@;
      }

      print Dumper(\%info) if DEBUG;

      # generate the schema
      my $cnt = 0;
      my $elements;
      for my $ns (keys %{$info{EltOccurs}}) { # for each ns
          open my $sch, ">schema$cnt.xsd" or die "Couldn't open schema$cnt.xsd: $!";
          for my $el (keys %{$info{EltOccurs}->{$ns}}) { # for each element
              my $hasCM = scalar keys %{$info{EltContains}->{$ns}->{$el}->{$ns}};
              my $hasTxt = $info{EltText}->{$ns}->{$el} ? " mixed='true'" : '';

              # we need to deal with text here
              $elements .= "  <element name='$el'$hasTxt>\n    <complexType>\n";
              $elements .= "      <choice minOccurs='0' maxOccurs='unbounded'>\n" if $hasCM;
              # elements (we don't do multins right now)
              #for my $ref (keys %{$info{EltContains}->{$ns}}) {
              for my $ref (keys %{$info{EltContains}->{$ns}->{$el}->{$ns}}) {
                  $elements .= "        <element ref='svg:$ref'/>\n";
              }
              $elements .= "      </choice>\n" if $hasCM;

              # attributes
              for my $attr (keys %{$info{AttrOccurs}->{$ns}->{$el}->{''}}) {
                  next if $attr eq 'xmlns';
                  $elements .= "      <attribute name='$attr' type='string'/>\n";
              }
              $elements .= "    </complexType>\n  </element>\n\n";
          }

          # we don't do multiNS
          print $sch <<"    EOXSD";
      <schema
              targetNamespace='$ns'
              xmlns='http://www.w3.org/2001/XMLSchema'
              xmlns:xsd='http://www.w3.org/2001/XMLSchema'
              xmlns:svg='$ns'
              elementFormDefault='qualified'
              >

          EOXSD

          print $sch $elements;
          print $sch "</schema>\n";
          close $sch;
      }

      # handler
      package SchemaGenerator;

      sub new {
          my $class = shift;
          my $info  = shift;

          my %self;
          $self{info} = $info;
          $self{ctx}  = [];
          return bless \%self, $class;
      }

      sub start_element {
          my $self = shift;
          my $el   = shift;

          my $ctx  = $self->{ctx};
          my $info = $self->{info};
          if (@$ctx) {
              $info->{EltContains}->{$ctx->[-1][0]}->{$ctx->[-1][1]}
                                  ->{$el->{NamespaceURI}}->{$el->{LocalName}}++;
          }
          $info->{EltOccurs}->{$el->{NamespaceURI}}->{$el->{LocalName}}++;

          for my $attr (values %{$el->{Attributes}}) {
              $info->{AttrOccurs}
                   ->{$el->{NamespaceURI}}->{$el->{LocalName}}
                   ->{$attr->{NamespaceURI}}->{$attr->{LocalName}}++;
          }

          push @{$self->{ctx}}, [ $el->{NamespaceURI}, $el->{LocalName} ];
      }

      sub end_element {
          pop @{$_[0]->{ctx}};
      }

      sub characters {
          my $self = shift;
          my $data = shift->{Data};
          return unless $data =~ m/\S/;

          my $ctx = $self->{ctx};
          $self->{info}->{EltText}->{$ctx->[-1][0]}->{$ctx->[-1][1]}++;
      }
      --

      -- Robin Berjon [berjon.com]