#! /usr/bin/env awk -f
###############################################################################
##
##  USAGE:  [g|m|n]awk [-v FIX=0|1] [-v STY=<targetformat>] [-v SEED=<number>] -f GED_UID.fix.awk [<]infile.GED [>outfile.GED]
##  NOTES:
##  list of actions executed for GEDCOM-files ...
##
#~  search _UID-tag-strings for patterns of 16 (or +2 for checksum) octet-sequences in hexadigit notation
#~  ignore (i.e. accept) several other characters common for various UUID-formats/standards (delimiters, separators, prefixes)
#~  if pattern-matching succeeds
#~      transform the leading 16 octets into numbers representing the significant 128-bit-value of a UUID
#~      compute a 2-octet checksum (PAF-GEDCOM-_UID-algorithm), transform the checksum into a 4-hexadigit string
#~      compose a new UUID-representation in <targetformat> from source-octets (accordingly plus new checksum)
#~      compare source-string of UUID-representation (incl. all surplus characters) with target-string/format
#~  if pattern-matching fails
#~      create a new UUID-string in <targetformat>
#~  if FIX==boolean(true)  replace source-string with target-string, output whole (fixed) GEDCOM-file
#~  if FIX==boolean(false) output source GEDCOM_UID-line followed by newline computed/compared UUID
#~      ___1-prefix-tag : <sourceformat> and <targetformat> are identical ("true")
#~      ___X-prefix-tag : different formats, but significant 128-bit-value preserved in valid <targetformat>
#~      ___0-prefix-tag : no match for any UUID-128-bit-value, new 128-bit generated in valid <targetformat>
#~  a trailing carriage return (CRLF line ending) on a _UID-line is not part of the comparison;
#~  it is detached before comparing and re-attached to every line written for that record
##
##  OPTIONS:
##
##  -v FIX=0|1
##      0 evaluates to false : (default) check _UID-tags for conformance with <targetformat>, output comparison
##      1 evaluates to true  : transform ("fix") _UID-tags value into <targetformat>, output fixed GEDCOM-file
##
##  -v STY="_UID"|"GUID"|"UUID"|"XUID"|"UURN"|"XURN"|<targetformat>
##      "_UID"  XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCCCC (default)
##              PAF-GEDCOM-_UID 16+2 bytes, 36 chars uppercase hexdigit with checksum
##      "UUID"  xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
##              RFC-4122-UUIDv4 16 bytes, 32+4 chars lowercase hexdigit hyphen-grouped
##      "GUID"  {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}
##              embraced {UUIDv4} 16 bytes, 32+6 chars uppercase hexdigit hyphen-grouped
##      "XUID"  {XXXxXXxx-XXxX-XxXx-Xxxx-xxXXxXXxXXxx}cccc
##              extended mixedcase and -style {GUIDv4}, 4-hexdigit checksum appended
##      "UURN"  urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
##              prefixed lowercase "urn:uuid:UUIDv4" (RFC-4122, UUID as URN)
##      "XURN"  urn:uuid:xXXxxXXx-XXxx-XXxx-XXXX-xXxxXXxXxxxX+cccc
##              extended mixedcase "urn:uuid:UUIDv4+checksum" (RFCs 2141+3986+4122)
##      else:   XXXXxxxx-XXxX-XxXX-XXXX-XXXxXxXxxxXx cccc
##              combined mixedcase UUIDv4 with 4-hexdigit checksum (set apart)
##
##  -v SEED=<number>
##      optional seed for the random generator; when omitted the generator is
##      seeded from the time of day, so generated UUIDs differ between runs
##
#~  New (self-) generated UUIDs are always of RFC-4122 random type v4,
#~  independent of a grouped or straight format. Divergent from standard,
#~  the generator outputs randomly mixed-case letters. The non-standard
#~  XUID-, XURN- and fallback-targetformats (if user's choice of "format" is an
#~  undefined token) are case-preserving, but easy to convert.
#~
#~  Given valid source-values (and their notation-fragments) take precedence
#~  over generated values. Joint with the case-preserving XUID-, XURN- and
#~  fallback-targetformats, a mixed-case output may result from the source
#~  (copy of case) or the generator (randomly mixed case). But as long as
#~  vendors do not provide an algorithm of creation, mixedcase source-UUIDs
#~  are not really comparable at string-level. The patterns are most likely
#~  always different and recommended for change. Beyond that, the lettercase
#~  is not recoverable after a normalization or change of format.
##
###############################################################################
##
#~  RFC-2141, URN Syntax
#~
#~      Some namespaces may define additional lexical equivalences, such as
#~      case-insensitivity of the NSS (or parts thereof).
#~
#~  RFC-4122, A Universally Unique IDentifier (UUID) URN Namespace
#~
#~      The internal representation of a UUID is a specific sequence of
#~      bits in memory, [...]. To accurately
#~      represent a UUID as a URN, it is necessary to convert the bit
#~      sequence to a string representation.
#~
#~      Each field is treated as an integer and has its value printed as a
#~      zero-filled hexadecimal digit string with the most significant
#~      digit first. The hexadecimal values "a" through "f" are output as
#~      lower case characters and are case insensitive on input.
##
###############################################################################
##
##  2013 ~ Stefan Unterstein <http://unterstein.net/ged1212xml>
##
##  This program is free software ("freeware"):
##  you can redistribute it and/or modify it as you like.
##
##  The program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##
###############################################################################

BEGIN {
    # Collapse FIX to a strict 0/1 flag; unset, empty or 0 means "check only".
    FIX = (FIX ? 1 : 0);

    # Known style tokens pass through, an empty/unset STY selects "_UID",
    # anything else becomes "" and selects the fallback format.
    STY = (STY ~ /^([_GUX]UID|[UX]URN)$/) ? STY : ((STY ~ /^$/) ? "_UID" : "");
    #
    # FIX default = 0 = false = check only
    #             = output given vs computed _UIDs only
    #
    #   awk -f GED_UID.fix.awk infile.ged | grep -B 1 "___[0X]"
    #       will filter flawed _UIDs and computed replacements
    #
    #   awk -v FIX=1 -f GED_UID.fix.awk inflawed.ged > outfixed.ged
    #       will fix 'em (to PAF-compatible _UIDs)
    #
    #   awk -v FIX=1 -v STY=UUID -f GED_UID.fix.awk inflawed.ged > outfixed.ged
    #       will fix 'em (to RFC-compatible UUIDs)
    #

    # Seed the generator. Without this rand() yields the same sequence on
    # every run and "new" UUIDs would repeat from file to file.
    if (SEED != "")
        srand(SEED);
    else
        srand();

    mkXB2N(xbyte);

    # Fill xchar[1..32] one character at a time; splitting on an empty
    # separator is not defined by POSIX.
    xpool = "01234567cdef89ab89AB01234567CDEF";
    for (xi = 1; xi <= length(xpool); xi++)
        xchar[xi] = substr(xpool, xi, 1);
    #
    # make xbyte an array of HexDigit-Byte-(zero-filled)-Indices-to-Number
    #   xbyte["00"]=0 xbyte["01"]=1 .. "ff"="Ff"="fF"="FF"=255
    #
    # xchar for UUIDv4 = xxxxxxxx-xxxx-4xxx-Yxxx-xxxxxxxxxxxx
    #
    # usage lower case:
    #   x = xchar[int(rand()*16+1)]
    #   y = xchar[int(rand()*4+13)]
    #
    # usage mixed case:
    #   x = xchar[int(rand()*32+1)]
    #   y = xchar[int(rand()*8+13)]
    #
    # usage upper case:
    #   x = xchar[int(rand()*16+17)]
    #   y = xchar[int(rand()*4+17)]
    #

    Hx01RE = "[0-9a-fA-F]";
    Hx02RE = Hx01RE Hx01RE;                         # octet/byte
    Hx04RE = Hx02RE "-?" Hx02RE;
    Hx08RE = Hx04RE "-?" Hx04RE;
    Hx12RE = Hx04RE "-?" Hx04RE "-?" Hx04RE;
    chksRE = "([- +]?" Hx04RE ")?";
    # Braces are written as bracket expressions so that no awk can read them
    # as the start of an interval expression.
    xuidRE = "[{]?" Hx08RE "-?" Hx04RE "-?" Hx04RE "-?" Hx04RE "-?" Hx12RE "[}]?" chksRE;
    xurnRE = "([uU][rR][nN]:[uU][uU][iI][dD]:)?" xuidRE;
    #
    # captures GUIDs, UUIDs, _UIDs, URNs prefix, with or w/o plus|minus|space checksum, any lettercase, any hyphen-byte-grouping
    #> marks output- or replacement-formats, four of them canonical or quasi-standards
    #
    #   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    #>  XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCCCC
    #   {xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}
    #   {xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}cccc
    #>  xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
    #   xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxcccc
    #>  {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}
    #>  {XxXXXxXX-xxxX-XXXx-XxxX-XxXXxxxXXXxX}cccc
    #>  urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
    #>  urn:uuid:XXxXXXXx-XXxX-xXxx-Xxxx-xxxxXxxXXxxX+cccc
    #
    # ... any hyphen-byte-grouping from none to all (grouping half-byte "nibbles" doesn't make any sense to me)
    #
    #   xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx
    #   xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx cc-cc
    #   {xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx}±cc-cc
    #
}

###############################################################################
###############################################################################

# Every "<level> _UID <value>" record: classify the value and either rewrite
# the record (FIX) or print it followed by a verdict line (check mode).
$1 ~ /[0-9]/ && $2 == "_UID" {
    # Detach a trailing CR so CRLF files compare correctly; it is put back on
    # everything written for this record.
    gedEOL = "";
    if (sub(/\r$/, ""))
        gedEOL = "\r";

    gedLevel = $1;
    gedTag   = $2;

    gvnUUID = match($3, xurnRE) ? substr($3, RSTART, RLENGTH) : "";
    if (gvnUUID != "") {
        #~ valUUID = (length(gvnUUID)>=32) ? substr(gvnUUID,1,32) : "" ;
        #~ chkUUID = (length(gvnUUID)==36) ? substr(gvnUUID,33,4) : "" ;
        cmpUUID = uuid4matter(gvnUUID, STY);
        if ($0 == (gedLevel " " gedTag " " cmpUUID)) {
            #
            # true <targetformat> and value, comp'd and given ID+checksum are identical
            # if <targetformat>==_UID (default), value and format are likely to be accepted by PAF-compatibles
            #
            verdict = "___1";
        }
        else {
            #
            # true UUID 128-bit value, but false format or checksum, or surplus characters
            # value now preserved and transformed into <targetformat>, accordingly plus new checksum
            # if <targetformat>==_UID and not eXchanged, this and next are likely to be rejected by PAF-compatibles
            #
            verdict = "___X";
        }
    }
    else {
        #
        # false, no (valid) UUID or 128-bit-value available, new UUID in <targetformat> generated
        #
        cmpUUID = uuid4matter(mkUUID(), STY);
        verdict = "___0";
    }

    if (FIX) {
        # A conforming record is kept as given; anything else is rebuilt.
        if (verdict != "___1")
            $0 = gedLevel " " gedTag " " cmpUUID;
        $0 = $0 gedEOL;
    }
    else {
        # Original record, then the verdict one level deeper.
        $0 = $0 gedEOL "\n" (gedLevel + 1) " " verdict " " cmpUUID gedEOL;
        print;
    }
}

###############################################################################

# In FIX mode the whole (possibly rewritten) file is passed through.
FIX { print; }

###############################################################################
###############################################################################
# functions
###############################################################################

# mkUUID() -- return a new random version-4 UUID in hyphen-grouped notation
# with randomly mixed letter case.
#   31 rand() per UUID; depends on global xchar[] filled in BEGIN.
function mkUUID(    UUID)
{
    # Variant digit comes from xchar[13..20] = 8 9 a b 8 9 A B.
    UUID = "xxxxxxxx-xxxx-4xxx-" xchar[int(rand()*8+13)] "xxx-xxxxxxxxxxxx";
    # Replace placeholders left to right; no hex digit is an "x", so the loop
    # ends once all 30 placeholders are consumed.
    while (sub(/x/, xchar[int(rand()*32+1)], UUID))
        ;
    return UUID;
}

# mkXB2N(a) -- fill array a with every two-hexdigit spelling of a byte, in all
# four letter-case combinations, mapped to its numeric value 0..255.
function mkXB2N(a,    n, lo, up)
{
    for (n = 0; n < 256; n++) {
        lo = sprintf("%02x", n);
        up = sprintf("%02X", n);
        a[lo] = n;
        a[up] = n;
        a[substr(lo, 1, 1) substr(up, 2, 1)] = n;
        a[substr(up, 1, 1) substr(lo, 2, 1)] = n;
    }
}

# uuid4matter(UUID, fmt) -- render the 16 leading octets of UUID in the format
# named by fmt ("_UID", "GUID", "UUID", "XUID", "UURN", "XURN"; anything else
# selects the "grouped UUID, space, checksum" fallback).
#   UUID may carry a urn:uuid: prefix, braces, hyphens, blanks, "+" and a
#   trailing checksum; all of that is dropped before the octets are read.
#   Depends on global xbyte[] filled by mkXB2N() in BEGIN.
function uuid4matter(UUID, fmt,    BytesSum1, BytesSum2, ChecksHex, CanonUUID, n)
{
    gsub(/([uU][rR][nN]:[uU][uU][iI][dD]:)|[-{ }+]/, "", UUID);
    UUID = substr(UUID, 1, 32);

    # Two running sums over the 16 octets: the plain sum and the sum of the
    # intermediate sums; each contributes one checksum octet.
    BytesSum1 = 0;
    BytesSum2 = 0;
    for (n = 1; n < 17; n++) {
        BytesSum1 += xbyte[substr(UUID, n*2-1, 2)];
        BytesSum2 += BytesSum1;
    }
    ChecksHex = sprintf("%02x%02x", BytesSum1 % 256, BytesSum2 % 256);

    CanonUUID = substr(UUID, 1, 8) "-" substr(UUID, 9, 4) "-" substr(UUID, 13, 4) "-" substr(UUID, 17, 4) "-" substr(UUID, 21, 12);

    if (fmt == "_UID") return toupper(UUID ChecksHex);
    if (fmt == "GUID") return "{" toupper(CanonUUID) "}";
    if (fmt == "UUID") return tolower(CanonUUID);
    if (fmt == "XUID") return "{" CanonUUID "}" ChecksHex;
    if (fmt == "UURN") return "urn:uuid:" tolower(CanonUUID);
    if (fmt == "XURN") return "urn:uuid:" CanonUUID "+" ChecksHex;
    return CanonUUID " " ChecksHex;
}

###############################################################################
# EOF
###############################################################################