#!/usr/bin/env python

# Joshua Sled (c) 2005 <jsled@>

# todo:
# -----
# xml:lang

# fixme:
# ------
# separate blank nodes from fragment ids/named nodes

import os, sys, xml.dom.minidom

# Help text printed when the script is run without arguments.  The program
# name is filled in from the command line the script was started with.
USAGE = """%(name)s -- parses the given RX-format XML into NTriples.
Usage:
$ %(name)s input
    input: a file or '-' to indicate stdin.
Output is to stdout.
""" % dict( name = sys.argv[0] )

class NS:
    """
    A namespace: a base URI onto which local names are appended.

    ``ns['name']`` gives the base URI followed by ``name``; ``str(ns)``
    gives the base URI on its own.
    """
    def __init__(self, baseUri):
        """
        @param baseUri the URI every name in this namespace starts with
        """
        self.baseUri = baseUri

    def __str__(self):
        return self.baseUri

    def __getitem__(self, key):
        qualified = self.baseUri + key
        return qualified

# Namespace of the rx control vocabulary: the ``is:`` attributes and elements
# that the parsing functions in this file look up (about, a, aListOf, ...).
nsIS = NS("http://asynchronous.org/rx/ns/2005/01/is#")
# Namespace of the RDF vocabulary (type, first, rest, nil, List, XMLLiteral).
nsRDF = NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
# The XML namespace.  Not referenced by the code visible in this file;
# presumably reserved for the ``xml:lang`` todo -- confirm before removing.
nsXML = NS("http://www.w3.org/XML/1998/namespace")

class URI:
    """A URI reference; ``str()`` renders it in NTriples ``<...>`` form."""
    def __init__(self, uri):
        """
        @param uri the URI text, without the surrounding angle brackets
        """
        self.uri = uri

    def __str__(self):
        bracketed = "<%s>" % self.uri
        return bracketed

class Literal:
    """
    An RDF literal, optionally typed; ``str()`` renders it in NTriples form,
    i.e. ``"value"`` or ``"value"^^<datatype>``.
    """
    def __init__(self, val, datatype = None):
        """
        @param val literal value
        @param datatype:URI datatype of the literal, or None for a plain literal
        """
        self.val = val
        self.datatype = datatype

    def __str__(self):
        # Render the value first so that non-string values keep working.
        text = "%s" % ( self.val, )
        # Escape the characters that may not appear raw inside an NTriples
        # string.  The backslash has to go first, otherwise the backslashes
        # introduced by the later replacements would be escaped again.
        for raw, escaped in ( ( '\\', '\\\\' ),
                              ( '"', '\\"' ),
                              ( '\n', '\\n' ),
                              ( '\r', '\\r' ),
                              ( '\t', '\\t' ) ):
            text = text.replace( raw, escaped )
        dtPart = ""
        if self.datatype:
            dtPart = "^^%s" % ( self.datatype )
        return "\"%s\"%s" % ( text, dtPart )

class BlankNode:
    """
    A blank node; ``str()`` renders it as ``_:label``.

    A node created with a name uses that name as its label.  A node created
    without one takes the current value of the class-wide counter, which is
    then advanced.
    """
    # Class-wide counter supplying the label of the next unnamed node.
    id = 0

    def __init__(self, name=None):
        """
        @param name label for the node, or None to take the next counter value
        """
        if name is None:
            self.id = BlankNode.id
            BlankNode.id += 1
        else:
            self.id = name

    def __str__(self):
        return "_:" + str(self.id)

class Triple:
    """A subject/predicate/object statement, rendered as one NTriples line."""
    def __init__( self, s, p, o ):
        """
        @param s the subject
        @param p the predicate
        @param o the object
        """
        self.s = s
        self.p = p
        self.o = o

    def __str__( self ):
        parts = ( self.s, self.p, self.o )
        return "%s %s %s ." % parts

def isMixedContent( node ):
    """
    Tell whether the node has both element children and non-blank text.

    Text children consisting only of whitespace are ignored.

    @param node an XML minidom node
    @return True if at least one non-blank text child and at least one
            element child are present, False otherwise.
    """
    sawText = False
    sawElement = False
    for kid in node.childNodes:
        kind = kid.nodeType
        if kind == node.TEXT_NODE:
            # whitespace-only text does not count as content
            if kid.data.strip():
                sawText = True
        elif kind == node.ELEMENT_NODE:
            sawElement = True
    return sawText and sawElement

def isTextOnlyContent( node ):
    """
    Tell whether every child of the node is a text node.

    A node without children also counts as text-only.

    @param node an XML minidom node
    @return False as soon as a non-text child is found, True otherwise.
    """
    # An explicit loop rather than len(filter(...)): filter() returns an
    # iterator on Python 3, which has no len().
    for child in node.childNodes:
        if child.nodeType != child.TEXT_NODE:
            return False
    return True

def getTextContent( node ):
    """
    Collect the text directly inside the node.

    Only the node's own text children contribute; text nested inside child
    elements is not included.

    @param node an XML minidom node
    @return the concatenated text with leading and trailing whitespace removed.
    """
    pieces = [ kid.data for kid in node.childNodes
               if kid.nodeType == kid.TEXT_NODE ]
    return ''.join( pieces ).strip()

def isEmpty( node ):
    """
    Tell whether the node has no children at all.

    @param node an XML minidom node
    @return True if the node's child list is empty, False otherwise.
    """
    return not node.childNodes

def rxSubjectOf( node ):
    """
    Work out which resource the node describes, from its ``is:about``.

    @param node an XML minidom element
    @return a fresh ``BlankNode`` when ``is:about`` is absent or empty; a
            ``BlankNode`` named by the text after the ``#`` when the value
            starts with ``#``; otherwise a ``URI`` holding the value.
    """
    about = node.getAttributeNS( str(nsIS), 'about' )
    if about == '':
        return BlankNode()
    if about.startswith('#'):
        return BlankNode( about[1:] )
    return URI(about)

def parseNode( node, subject, isListOf = None, addRelation = True ):
    """
    Parse one property element into triples about ``subject``.

    The predicate is the element's namespace URI followed by its local name.
    The element is handled as the first of these that applies:
      - ``is:literalXml="true"``: its children are serialized into a single
        rdf:XMLLiteral object.
      - no children: the object is the resource given by ``rxSubjectOf``.
      - text children only: the object is a literal, typed by
        ``is:ofDatatype`` when that attribute is present.
      - otherwise: the element describes a nested resource, and its child
        elements are parsed recursively.  Children whose name equals the
        ``is:aListOf`` attribute are chained into an RDF list.

    @param node an XML minidom node
    @param subject the subject of the triples for statements in this node.
    @param isListOf:URI if given and equal to this element's name, a nested
           resource is attached to ``subject`` with rdf:first rather than
           with the element's own name.
    @param addRelation only consulted for nested resources that are not list
           items: if false, no statement connecting ``subject`` to the nested
           resource is added.
    @return A list of ``Triple``s parsed from this node and its descendants.
    @raise Exception if the element mixes text with element children.
    """
    triples = []
    tagRelation = URI( node.namespaceURI + node.localName )

    if node.getAttributeNS( str(nsIS), 'literalXml' ) == 'true':
        xmlText = ''.join( [ c.toxml() for c in node.childNodes ] )
        return [ Triple( subject, tagRelation, Literal( xmlText, URI(nsRDF['XMLLiteral']) ) ) ]

    if isMixedContent( node ):
        raise Exception( "we don't handle mixed content" )

    if isEmpty(node):
        target = rxSubjectOf( node )
        triples.append( Triple( subject, tagRelation, target ) )

    elif isTextOnlyContent(node):
        text = getTextContent(node)
        datatype = node.getAttributeNS( str(nsIS), 'ofDatatype' )
        if datatype == '':
            datatype = None
        else:
            datatype = URI(datatype)
        lit = Literal(text, datatype)
        triples.append( Triple( subject, tagRelation, lit ) )

    else:
        newSubj = rxSubjectOf( node )

        if isListOf and str(isListOf) == str(tagRelation):
            triples.append( Triple( subject, URI(nsRDF['first']), newSubj ) )
        elif addRelation:
            triples.append( Triple( subject, tagRelation, newSubj ) )
        # Otherwise the caller asked for no connecting statement.  The
        # previous code had ``assert("unknown state")`` here, which could
        # never fail (a non-empty string is always true), so falling through
        # silently is the behaviour callers already get.

        isA = node.getAttributeNS( str(nsIS), 'a' )
        if isA != '':
            triples.append( Triple( newSubj, URI(nsRDF['type']), URI(isA) ) )

        listEltUri = node.getAttributeNS( str(nsIS), 'aListOf' )
        listStart = None
        currentListNode = None

        for child in filter( lambda n: n.nodeType == n.ELEMENT_NODE, node.childNodes ):
            # by default, we'll parse the node as a regular property...
            isList = (child.namespaceURI + child.localName) == listEltUri
            if isList:
                # parse the node as a list element...
                newNode = BlankNode()
                if currentListNode is not None:
                    # chain forward
                    triples.append( Triple( currentListNode, URI(nsRDF['rest']), newNode ) )
                else:
                    listStart = newNode
                currentListNode = newNode

                parsedChildTriples = parseNode( child, currentListNode, URI(listEltUri) )
                # An XML-literal item yields exactly one triple whose object
                # is the literal, just like an empty or text-only item, so it
                # needs the same treatment; otherwise the list cell would be
                # left without an rdf:first.
                childIsXmlLiteral = child.getAttributeNS( str(nsIS), 'literalXml' ) == 'true'
                if childIsXmlLiteral or isEmpty( child ) or isTextOnlyContent( child ):
                    # if literal, strip the literal object out from the
                    # parsed content and place it immediately as the object
                    # of the list triple.
                    triples.append( Triple( currentListNode, URI(nsRDF['first']), parsedChildTriples[0].o ) )
                else:
                    triples.extend( parsedChildTriples )
            else:
                parsedChildTriples = parseNode( child, newSubj )
                triples.extend( parsedChildTriples )

        # if we ended up creating a list, then add ingress/egress/type.
        if currentListNode is not None:
            triples.extend( [ Triple( newSubj, URI(listEltUri), listStart ),
                              Triple( listStart, URI(nsRDF['type']), URI(nsRDF['List']) ),
                              Triple( currentListNode, URI(nsRDF['rest']), URI(nsRDF['nil']) ) ] )
    return triples

def parsePureRx( root ):
    """
    Parse the children of a pure-rx container element into triples.

    Every ``is:aDescription`` child element describes one subject (see
    ``rxSubjectOf``): its ``is:a`` attribute, when present, gives an rdf:type
    statement, and each of its child elements is parsed with ``parseNode``.
    All other children are ignored.

    @param root the XML minidom element whose children are examined
    @return A list of ``Triple``s, empty if there are no descriptions.
    """
    triples = []
    for child in root.childNodes:
        # Only elements can be descriptions.  Comments and processing
        # instructions carry neither a namespace nor a local name, so
        # building their qualified name would raise a TypeError.
        if child.nodeType != child.ELEMENT_NODE:
            continue
        # An element outside any namespace has namespaceURI None; it cannot
        # be an is:aDescription, so treat its namespace as empty and let the
        # comparison below skip it.
        name = (child.namespaceURI or '') + child.localName
        if name != (nsIS['aDescription']):
            continue
        subj = rxSubjectOf( child )
        isA = child.getAttributeNS( str(nsIS), 'a' )
        if isA != '':
            triples.append( Triple( subj, URI(nsRDF['type']), URI(isA) ) )
        for c in filter( lambda n: n.nodeType == n.ELEMENT_NODE, child.childNodes ):
            triples.extend( parseNode( c, subj ) )
    return triples

def parse( file ):
    """
    Parse the given file as rx-format XML.

    If the root element is ``is:stuff`` the document is handled by
    ``parsePureRx``.  Otherwise the root element itself describes a subject
    (see ``rxSubjectOf``) whose rdf:type is the root element's name, and each
    child element of the root is parsed with ``parseNode``.

    @param file whatever ``xml.dom.minidom.parse`` accepts as its input
    @return A list of ``Triple``s parsed from the document.
    """
    dom = xml.dom.minidom.parse( file )
    # Use the document element rather than the document's first child: a
    # comment, DOCTYPE or processing instruction may precede the root
    # element, in which case the first child is not an element at all.
    root = dom.documentElement
    parsedTriples = []
    if (root.namespaceURI + root.localName) == nsIS['stuff']:
        parsedTriples.extend( parsePureRx( root ) )
    else:
        subj = rxSubjectOf(root)
        parsedTriples.append( Triple( subj, URI(nsRDF['type']), URI( root.namespaceURI + root.localName ) ) )
        for c in filter( lambda n: n.nodeType == n.ELEMENT_NODE, root.childNodes ):
            parsedTriples.extend( parseNode( c, subj ) )
    return parsedTriples

if __name__ == "__main__":
    # Command-line entry point: parse the input named by the first argument
    # ('-' meaning stdin) and print one triple per line on stdout.
    if len(sys.argv) == 1:
        print(USAGE)
        sys.exit(1)
    filename = sys.argv[1]
    if filename == "-":
        triples = parse( sys.stdin )
    else:
        infile = open( filename )
        # Close the file we opened even if parsing fails; stdin is left
        # alone because this script did not open it.
        try:
            triples = parse( infile )
        finally:
            infile.close()
    # print(x) with a single argument behaves the same as the Python 2 print
    # statement, and is also valid on Python 3.
    for triple in triples:
        print(triple)
