--- regexp.h	2001/01/04 23:22:09	1.1
+++ regexp.h	2001/01/04 23:22:16
@@ -30,6 +30,7 @@
         struct reg_data *data;	/* Additional data. */
 	char *subbeg;		/* saved or original string 
 				   so \digit works forever. */
+        U32 *offsets;           /* offset annotations 20001228 MJD */
 	I32 sublen;		/* Length of string pointed by subbeg */
 	I32 refcnt;
 	I32 minlen;		/* mininum possible length of $& */
--- regcomp.c	2000/12/18 21:26:57	1.1
+++ regcomp.c	2001/01/16 23:28:12
@@ -115,9 +115,11 @@
     U16		flags16;		/* are we folding, multilining? */
     char	*precomp;		/* uncompiled string. */
     regexp	*rx;
+    char	*start;			/* Start of input for compile */
     char	*end;			/* End of input for compile */
     char	*parse;			/* Input-scan pointer. */
     I32		whilem_seen;		/* number of WHILEM in this expr */
+    regnode	*emit_start;		/* Start of emitted-code area */
     regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
     I32		naughty;		/* How bad is this pattern? */
     I32		sawback;		/* Did we see \1, ...? */
@@ -136,10 +138,13 @@
 #define RExC_flags16	(pRExC_state->flags16)
 #define RExC_precomp	(pRExC_state->precomp)
 #define RExC_rx		(pRExC_state->rx)
+#define RExC_start	(pRExC_state->start)
 #define RExC_end	(pRExC_state->end)
 #define RExC_parse	(pRExC_state->parse)
 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
+#define RExC_offsets	(pRExC_state->rx->offsets) /* I am not like the others */
 #define RExC_emit	(pRExC_state->emit)
+#define RExC_emit_start	(pRExC_state->emit_start)
 #define RExC_naughty	(pRExC_state->naughty)
 #define RExC_sawback	(pRExC_state->sawback)
 #define RExC_seen	(pRExC_state->seen)
@@ -152,6 +157,7 @@
 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 	((*s) == '{' && regcurly(s)))
+
 #ifdef atarist
 #define	PERL_META	"^$.[()|?+*\\"
 #else
@@ -421,6 +427,54 @@
 /* Allow for side effects in s */
 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
 
+/* Macros for recording node offsets.   20001227 mjd@plover.com 
+ * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
+ * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
+ * Element 0 holds the number n.
+ */
+
+/* A node number larger than this indicates an arithmetic fuckup. */
+#define MAX_SAFE_NODE 10000
+#define MJD_OFFSET_DEBUG(x) 
+/* #define MJD_OFFSET_DEBUG(x) fprintf x */
+
+#  define Set_Node_Offset_To_R(node,byte)                           \
+   STMT_START {                                        \
+     if (! SIZE_ONLY) {                                  \
+       if((node) < 0 || (node) > MAX_SAFE_NODE) {                   \
+         Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
+       } else {                                                        \
+         RExC_offsets[2*(node)-1] = (byte);                               \
+       }                                                               \
+     }                                                                 \
+   } STMT_END
+
+#  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
+#  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
+
+#  define Set_Node_Length_To_R(node,len)                            \
+   STMT_START {                                        \
+     if (! SIZE_ONLY) {                                  \
+       MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
+       if((node) < 0 || (node) > MAX_SAFE_NODE) {                   \
+         Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
+       } else {                                                        \
+         RExC_offsets[2*(node)] = (len);                               \
+       }                                                               \
+     }                                                                 \
+   } STMT_END
+
+#  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
+#  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
+#  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
+
+
+
+
+/* Get offsets and lengths */
+#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
+#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
+
 static void clear_re(pTHXo_ void *r);
 
 /* Mark that we cannot extend a found fixed substring at this point.
@@ -1601,6 +1655,7 @@
 
     /* First pass: determine size, legality. */
     RExC_parse = exp;
+    RExC_start = exp;
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
@@ -1648,6 +1703,15 @@
     r->startp = 0;			/* Useful during FAIL. */
     r->endp = 0;			/* Useful during FAIL. */
 
+    Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
+    if (r->offsets) {
+      r->offsets[0] = RExC_size; 
+    }
+    DEBUG_r(PerlIO_printf(Perl_debug_log, 
+                          "%s %u bytes for offset annotations.\n", 
+                          r->offsets ? "Got" : "Couldn't get", 
+                          (2*RExC_size+1) * sizeof(U32)));
+
     RExC_rx = r;
 
     /* Second pass: emit code. */
@@ -1655,6 +1719,7 @@
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_emit_start = r->program;
     RExC_emit = r->program;
     /* Store the count of eval-groups for security checks: */
     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
@@ -2052,13 +2117,14 @@
 		    if (!SIZE_ONLY)
 			ret->flags = 2;
 		    regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+                    /* Deal with length of this later - MJD */
 		    return ret;
 		}
 		return reganode(pRExC_state, EVAL, n);
 	    }
-	    case '(':
+	    case '(':           /* (?(?{...})...); (?(?=...)...) */
 	    {
-		if (RExC_parse[0] == '?') {
+		if (RExC_parse[0] == '?') { /* (?(?...)) */
 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 
 			|| RExC_parse[1] == '<' 
 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
@@ -2072,11 +2138,13 @@
 		    } 
 		}
 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+                    /* (?(1)...) */
 		    parno = atoi(RExC_parse++);
 
 		    while (isDIGIT(*RExC_parse))
 			RExC_parse++;
-		    ret = reganode(pRExC_state, GROUPP, parno);
+                    ret = reganode(pRExC_state, GROUPP, parno);
+                    
 		    if ((c = *nextchar(pRExC_state)) != ')')
 			vFAIL("Switch condition not recognized");
 		  insert_if:
@@ -2121,7 +2189,7 @@
                 break;
 	    default:
 		--RExC_parse;
-	      parse_flags:
+	      parse_flags:      /* (?i) */
 		while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
 		    if (*RExC_parse != 'o')
 			pmflag(flagsp, *RExC_parse);
@@ -2149,14 +2217,16 @@
 		return NULL;
 	    }
 	}
-	else {
+	else {                  /* (...) */
 	    parno = RExC_npar;
 	    RExC_npar++;
 	    ret = reganode(pRExC_state, OPEN, parno);
+            Set_Node_Length(ret, 1); /* MJD */
+            Set_Node_Offset(ret, RExC_parse); /* MJD */
 	    open = 1;
 	}
     }
-    else
+    else                        /* ! paren */
 	ret = NULL;
 
     /* Pick up the branches, linking them together. */
@@ -2211,6 +2281,8 @@
 	    break;
 	case 1:
 	    ender = reganode(pRExC_state, CLOSE, parno);
+            Set_Node_Offset(ender,RExC_parse+1); /* MJD */
+            Set_Node_Length(ender,1); /* MJD */
 	    break;
 	case '<':
 	case ',':
@@ -2364,6 +2436,7 @@
     op = *RExC_parse;
 
     if (op == '{' && regcurly(RExC_parse)) {
+        char * parse_start = RExC_parse; /* MJD */
 	next = RExC_parse + 1;
 	maxpos = Nullch;
 	while (isDIGIT(*next) || *next == ',') {
@@ -2392,6 +2465,7 @@
 	    RExC_parse = next;
 	    nextchar(pRExC_state);
 
+            Set_Node_Cur_Length(ret); /* Premature?  MJD */
 	do_curly:
 	    if ((flags&SIMPLE)) {
 		RExC_naughty += 2 + RExC_naughty / 2;
@@ -2408,6 +2482,7 @@
 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
 		}
 		reginsert(pRExC_state, CURLYX,ret);
+
 		if (!SIZE_ONLY && RExC_extralen)
 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
 		regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
@@ -2514,6 +2589,7 @@
 {
     register regnode *ret = 0;
     I32 flags;
+    char *parse_start = RExC_parse;
 
     *flagp = WORST;		/* Tentatively. */
 
@@ -2528,6 +2604,7 @@
 	    ret = reg_node(pRExC_state, SBOL);
 	else
 	    ret = reg_node(pRExC_state, BOL);
+        Set_Node_Length(ret, 1); /* MJD */
 	break;
     case '$':
 	nextchar(pRExC_state);
@@ -2539,6 +2616,7 @@
 	    ret = reg_node(pRExC_state, SEOL);
 	else
 	    ret = reg_node(pRExC_state, EOL);
+        Set_Node_Length(ret, 1); /* MJD */
 	break;
     case '.':
 	nextchar(pRExC_state);
@@ -2548,6 +2626,7 @@
 	    ret = reg_node(pRExC_state, REG_ANY);
 	*flagp |= HASWIDTH|SIMPLE;
 	RExC_naughty++;
+        Set_Node_Length(ret, 1); /* MJD */
 	break;
     case '[':
     {
@@ -2559,6 +2638,7 @@
 	}
 	nextchar(pRExC_state);
 	*flagp |= HASWIDTH|SIMPLE;
+        Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
 	break;
     }
     case '(':
@@ -2605,28 +2685,33 @@
 	    ret = reg_node(pRExC_state, SBOL);
 	    *flagp |= SIMPLE;
 	    nextchar(pRExC_state);
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'G':
 	    ret = reg_node(pRExC_state, GPOS);
 	    RExC_seen |= REG_SEEN_GPOS;
 	    *flagp |= SIMPLE;
 	    nextchar(pRExC_state);
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'Z':
 	    ret = reg_node(pRExC_state, SEOL);
 	    *flagp |= SIMPLE;
 	    nextchar(pRExC_state);
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'z':
 	    ret = reg_node(pRExC_state, EOS);
 	    *flagp |= SIMPLE;
 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
 	    nextchar(pRExC_state);
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'C':
 	    ret = reg_node(pRExC_state, SANY);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'X':
 	    ret = reg_node(pRExC_state, CLUMP);
@@ -2634,6 +2719,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_mark)
 		is_utf8_mark((U8*)"~");		/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'w':
 	    ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
@@ -2641,6 +2727,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_alnum)
 		is_utf8_alnum((U8*)"a");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'W':
 	    ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
@@ -2648,6 +2735,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_alnum)
 		is_utf8_alnum((U8*)"a");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'b':
 	    RExC_seen_zerolen++;
@@ -2657,6 +2745,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_alnum)
 		is_utf8_alnum((U8*)"a");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'B':
 	    RExC_seen_zerolen++;
@@ -2666,6 +2755,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_alnum)
 		is_utf8_alnum((U8*)"a");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 's':
 	    ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
@@ -2673,6 +2763,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_space)
 		is_utf8_space((U8*)" ");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'S':
 	    ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
@@ -2680,6 +2771,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_space)
 		is_utf8_space((U8*)" ");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'd':
 	    ret = reg_node(pRExC_state, DIGIT);
@@ -2687,6 +2779,7 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_digit)
 		is_utf8_digit((U8*)"1");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'D':
 	    ret = reg_node(pRExC_state, NDIGIT);
@@ -2694,11 +2787,13 @@
 	    nextchar(pRExC_state);
 	    if (UTF && !PL_utf8_digit)
 		is_utf8_digit((U8*)"1");	/* preload table */
+            Set_Node_Length(ret, 2); /* MJD */
 	    break;
 	case 'p':
 	case 'P':
 	    {	/* a lovely hack--pretend we saw [\pX] instead */
 		char* oldregxend = RExC_end;
+                char* parse_start = RExC_parse;
 
 		if (RExC_parse[1] == '{') {
 		    RExC_end = strchr(RExC_parse, '}');
@@ -2717,6 +2812,7 @@
 
 		RExC_end = oldregxend;
 		RExC_parse--;
+                Set_Node_Cur_Length(ret); /* MJD */
 		nextchar(pRExC_state);
 		*flagp |= HASWIDTH|SIMPLE;
 	    }
@@ -2739,6 +2835,7 @@
 		if (num > 9 && num >= RExC_npar)
 		    goto defchar;
 		else {
+                    char * parse_start = RExC_parse - 1; /* MJD */
 		    while (isDIGIT(*RExC_parse))
 			RExC_parse++;
 
@@ -2749,6 +2846,10 @@
 				   ? (LOC ? REFFL : REFF)
 				   : REF, num);
 		    *flagp |= HASWIDTH;
+                    
+                    /* override incorrect value set in reganode MJD */
+                    Set_Node_Offset(ret, parse_start+1); 
+                    Set_Node_Cur_Length(ret); /* MJD */
 		    RExC_parse--;
 		    nextchar(pRExC_state);
 		}
@@ -2779,6 +2880,7 @@
 	    register char *p;
 	    char *oldp, *s;
 	    STRLEN numlen;
+            char *parse_start = RExC_parse - 1;
 
 	    RExC_parse++;
 
@@ -2950,6 +3052,7 @@
 	    }
 	loopdone:
 	    RExC_parse = p - 1;
+            Set_Node_Cur_Length(ret); /* MJD */
 	    nextchar(pRExC_state);
 	    {
 		/* len is STRLEN which is unsigned, need to copy to signed */
@@ -3163,6 +3266,7 @@
     bool need_class = 0;
     SV *listsv;
     register char *e;
+    char *parse_start = RExC_parse; /* MJD */
     UV n;
 
     ret = reganode(pRExC_state, ANYOF, 0);
@@ -3820,6 +3924,18 @@
     NODE_ALIGN_FILL(ret);
     ptr = ret;
     FILL_ADVANCE_NODE(ptr, op);
+    if (RExC_offsets) {         /* MJD */
+      MJD_OFFSET_DEBUG((stderr, "%s: (op %s) %s %u <- %u (len %u) (max %u).\n", 
+              "reg_node",
+              reg_name[op],
+              RExC_emit - RExC_emit_start > RExC_offsets[0] 
+              ? "Overwriting end of array!\n" : "OK",
+              RExC_emit - RExC_emit_start,
+              RExC_parse - RExC_start,
+              RExC_offsets[0])); 
+      Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
+    }
+            
     RExC_emit = ptr;
 
     return(ret);
@@ -3844,6 +3960,17 @@
     NODE_ALIGN_FILL(ret);
     ptr = ret;
     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+    if (RExC_offsets) {         /* MJD */
+      MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
+              "reganode",
+              RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
+              "Overwriting end of array!\n" : "OK",
+              RExC_emit - RExC_emit_start,
+              RExC_parse - RExC_start,
+              RExC_offsets[0])); 
+      Set_Cur_Node_Offset;
+    }
+            
     RExC_emit = ptr;
 
     return(ret);
@@ -3881,10 +4008,33 @@
     src = RExC_emit;
     RExC_emit += NODE_STEP_REGNODE + offset;
     dst = RExC_emit;
-    while (src > opnd)
+    while (src > opnd) {
 	StructCopy(--src, --dst, regnode);
+        if (RExC_offsets) {     /* MJD 20010112 */
+          MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
+                  "reg_insert",
+                  dst - RExC_emit_start > RExC_offsets[0] 
+                  ? "Overwriting end of array!\n" : "OK",
+                  src - RExC_emit_start,
+                  dst - RExC_emit_start,
+                  RExC_offsets[0])); 
+          Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
+          Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
+        }
+    }
+    
 
     place = opnd;		/* Op node, where operand used to be. */
+    if (RExC_offsets) {         /* MJD */
+      MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
+              "reginsert",
+              place - RExC_emit_start > RExC_offsets[0] 
+              ? "Overwriting end of array!\n" : "OK",
+              place - RExC_emit_start,
+              RExC_parse - RExC_start,
+              RExC_offsets[0])); 
+      Set_Node_Offset(place, RExC_parse);
+    }
     src = NEXTOPER(place);
     FILL_ADVANCE_NODE(place, op);
     Zero(src, offset, regnode);
@@ -4098,6 +4248,16 @@
     if (r->reganch & ROPT_EVAL_SEEN)
 	PerlIO_printf(Perl_debug_log, "with eval ");
     PerlIO_printf(Perl_debug_log, "\n");
+    if (r->offsets) {
+      U32 i;
+      U32 len = r->offsets[0];
+      PerlIO_printf(Perl_debug_log, "Offsets: [%u]\n\t", r->offsets[0]);
+      for (i = 1; i <= len; i++)
+        PerlIO_printf(Perl_debug_log, "%u[%u] ", 
+                      r->offsets[i*2-1], 
+                      r->offsets[i*2]);
+      PerlIO_printf(Perl_debug_log, "\n");
+    }
 #endif	/* DEBUGGING */
 }
 
