Matching caret ([^]) using awk

I'm trying to find filenames with "offending" characters.

All is good except for the caret character. I can't seem to get the regex pattern right.

The error I am getting is as follows:

awk: warning: escape sequence `\^' treated as plain `^'
awk: cmd. line:5: (FILENAME=- FNR=1) fatal: invalid regexp: Unmatched [, [^, [:, [., or [=: /[^]/

/bin/sh code segment is as follows:

	for pattern in "[']" '[`]' '["]' '[~]' '[!]' '[&]' '[$]' '[*]' '[?]' '[:]' '[;]' '[%]' '[,]' '[=]' '[@]' '[<]' '[>]' '[#]' '[+]' "[\^]"
	do
		LC_ALL=C awk -v pat=${pattern} '{

I've tried all the following:

"[\^]"
"[\\^]"
'[^]'
'[\^]'

I've even tried adding double quotes around the "pat=" assignment, without success. Any ideas what I am doing wrong?


This is my full script, warts and all:

#!/bin/sh

lookUpChar()
{
	awk '{ if( index($0,"|") != 0 ){ print $0 ; } ; }' <"${indxFile}" | sort --version-sort >"BAR_${indxFile}"

	{	awk '{ if( index($0,"{") != 0 ){ print $0 ; } ; }' <"${indxFile}" 
		awk '{ if( index($0,"}") != 0 ){ print $0 ; } ; }' <"${indxFile}"
        } | sort --version-sort | uniq	>"BRACE_${indxFile}"

	{	awk '{ if( index($0,"(") != 0 ){ print $0 ; } ; }' <"${indxFile}" 
		awk '{ if( index($0,")") != 0 ){ print $0 ; } ; }' <"${indxFile}"
        } | sort --version-sort | uniq	>"BRK_RND_${indxFile}"

	{	awk '{ if( index($0,"[") != 0 ){ print $0 ; } ; }' <"${indxFile}" 
		awk '{ if( index($0,"]") != 0 ){ print $0 ; } ; }' <"${indxFile}"
        } | sort --version-sort | uniq	>"BRK_SQ_${indxFile}"
}

cd ${index}

for drv in 2 3 4 5 6 7
do
	indxFile="DB001_F${drv}.d.INDEX.txt"

	#lookUpChar

	###	FUTURES:	^
	###	Not acceptable to awk:  '[^]'

	for pattern in "[']" '[`]' '["]' '[~]' '[!]' '[&]' '[$]' '[*]' '[?]' '[:]' '[;]' '[%]' '[,]' '[=]' '[@]' '[<]' '[>]' '[#]' '[+]' "[\^]"
	do
	
	#awk '{ if( $0 ~ /['\'';:"<>?~,`!@#$*%^&+=]/ ){ print $0 ; } ; }' <"${indxFile}" | sort --version-sort >"PUNCT_${indxFile}"
	#awk '/['\'';:"<>?~,`!@#$*%^&+=]/ { print $0 ; }' <"${indxFile}" | sort --version-sort >"PUNCT_${indxFile}"

	#LC_ALL=C awk -v pats="\`\"&'" '{

	LC_ALL=C awk -v pat=${pattern} '{
		n=length($0) ;
		#for( i=1 ; i < n-1 ; i++ ){
			#pat=substr( pats, i, 1 ) ;
			if( $0 ~ pat ){
				#printf("\t pat[%d] = \"%s\" |%s\n", i, pat, $0 ) | "cat 1>&2" ; 
				printf("\t pat = \"%s\" |%s\n", pat, $0 ) | "cat 1>&2" ; 
				printf("%s|%s\n", pat, $0 ) ;
				#break ;
			} ;
		#} ;
	}' <"${indxFile}"
	#}' <"${indxFile}" | sort --version-sort >"PUNCT_${indxFile}"
	done >"PUNCT_${indxFile}"
done | more
1 Like

Is it the same problem as here: awk: warning: escape sequence `\]' treated as plain `]' - Stack Overflow

Essentially, a regex passed as "foo" is parsed differently to /foo/?

Pretty sure that is not the case, because the square brackets have a special meaning if used between single quotes, namely, enumeration of literals.

What I'm trying to get awk to do is interpret the caret as a caret, and not as a position indicator as start of string. I've tried many different approaches of defining, and passing the string, outside or inside of awk, and can't seem to get the correct "phrasing" for proper interpretation. :frowning:

As I've never used awk, I'm out of my depth; but there are differences in escaping with '' and "" in shell scripts that are also at play; along with whatever awk does when it interprets a string. Have you also tried '[\\^]'?

4 Likes

Have you tried using the hex character code for ^, like \xHH or \uHH?

3 Likes

Thank you, Eugene, but for some reason, variations of those did not work for me.

Thank you, Thom. I looked at that ... but for some reason, I am just not grokking what that is trying to tell me.

Thank you, Stephen. I forgot to consider the extra backslash because I wasn't using the quotes in the awk assignment.

To keep the original problem script untouched for reference, I am including separately the updated, finalized script here.


Script "USER__Report_FilesWithBadCharacters.sh":
(Note: corrected)

#!/bin/sh

dbg=0
if [ "$1" = "--progress" ] ; then  dbg=1 ; fi

cd ${index}
test $? -eq 0 || { echo "\n\t Unable to set '${index}' as working directory.  Task abandoned.\n BYe! \n" ; exit 1 ; }

doFunction="doChar"


doBar()
{
	LC_ALL=C awk '{ if( index($0,"|") != 0 ){ print $0 ; } ; }' <"${indxFile}" >"BAR_${indxFile}"
	test -s "BAR_${indxFile}" || { rm -vf "BAR_${indxFile}" ; }
}

doBrace()
{
	LC_ALL=C awk -v verb=${dbg} '{ if( index($0,"{") != 0 || index($0,"}") != 0 ){
		if( verb == 1 ){ printf("\t pat = \"{|}\" |%s\n", $0 ) | "cat 1>&2" ;  } ;
			print $0 ;
		} ;
	}' <"${indxFile}" >"BRACE_${indxFile}"
	test -s "BRACE_${indxFile}" || { rm -vf "BRACE_${indxFile}" ; }
}

doBrktRnd()
{
	LC_ALL=C awk -v verb=${dbg} '{ if( index($0,"(") != 0 || index($0,")") != 0 ){
		if( verb == 1 ){ printf("\t pat = \"(|)\" |%s\n", $0 ) | "cat 1>&2" ;  } ;
			print $0 ;
		} ;
	}' <"${indxFile}" >"BRK_RND_${indxFile}"
	test -s "BRK_RND_${indxFile}" || { rm -vf "BRK_RND_${indxFile}" ; }
}

doBrktSqr()
{
	LC_ALL=C awk -v verb=${dbg} '{ if( index($0,"[") != 0 || index($0,"]") != 0 ){
		if( verb == 1 ){ printf("\t pat = \"[|]\" |%s\n", $0 ) | "cat 1>&2" ;  } ;
			print $0 ;
		} ;
	}' <"${indxFile}" >"BRK_SQR_${indxFile}"
	test -s "BRK_SQR_${indxFile}" || { rm -vf "BRK_SQR_${indxFile}" ; }
}

doChar()
{
	#for pattern in "[']" '[`]' '["]' '[~]' '[!]' '[&]' '[$]' '[*]' '[?]' '[:]' '[;]' '[%]' '[,]' '[=]' '[@]' '[<]' '[>]' '[#]' '[+]' "[\^]"
	#do

	LC_ALL=C awk -v verb=${dbg} -v pat=${pattern} '{
		n=length($0) ;
		if( $0 ~ pat ){
			if( verb == 1 ){ printf("\t pat = \"%s\" |%s\n", pat, $0 ) | "cat 1>&2" ;  } ;
			printf("%s|%s\n", pat, $0 ) ;
		} ;
	}' <"${indxFile}" >"PUNCT_${indxFile}"
	#}' <"${indxFile}" | sort --version-sort >"PUNCT_${indxFile}"
	#done >"PUNCT_${indxFile}"
	test -s "PUNCT_${indxFile}" || { rm -vf "PUNCT_${indxFile}" ; }
}

echo "
	Select the character for which to generate a report:

	 1	[']
	 2	[\`]
	 3	[\"]
	 4	[~]
	 5	[!]
	 6	[&]
	 7	[$]
	 8	[*]
	 9	[?]
	10	[:]
	11	[;]
	12	[%]
	13	[,]
	14	[=]
	15	[@]
	16	[<]
	17	[>]
	18	[#]
	19	[+]
	20	[^]

	90	[|]
	91	[{}]
	92	[()]
	93	[\[\]]
	
	Enter selection [1-20,90-93] => \c" ; read ans

if [ -z "${ans}" ] ; then  exit ; fi

case ${ans} in
	 1 ) pattern="[']" ;;
	 2 ) pattern='[`]' ;;
	 3 ) pattern='["]' ;;
	 4 ) pattern='[~]' ;;
	 5 ) pattern='[!]' ;;
	 6 ) pattern='[&]' ;;
	 7 ) pattern='[$]' ;;
	 8 ) pattern='[*]' ;;
	 9 ) pattern='[?]' ;;
	10 ) pattern='[:]' ;;
	11 ) pattern='[;]' ;;
	12 ) pattern='[%]' ;;
	13 ) pattern='[,]' ;;
	14 ) pattern='[=]' ;;
	15 ) pattern='[@]' ;;
	16 ) pattern='[<]' ;;
	17 ) pattern='[>]' ;;
	18 ) pattern='[#]' ;;
	19 ) pattern='[+]' ;;
	20 ) pattern='[\\^]' ;;

	90 ) doFunction="doBar" ;;
	91 ) doFunction="doBrace" ;;
	92 ) doFunction="doBrktRnd" ;;
	93 ) doFunction="doBrktSqr" ;;

	* )	echo "\n\t Invalid selection made.  Only valid choices:  [1-20] \n Bye!\n" ; exit 1
		;;
esac

for drv in 2 3 4 5 6 7
do
	indxFile="DB001_F${drv}.d.INDEX.txt"

	${doFunction}
done