; This file was edited using TECkitMappingEditorU.exe v1.0.0.1 on 2/13/2007.
;   Conversion Type = Legacy_to_from_Unicode
;   Left-hand side font = Courier New;12
;   Right-hand side font = Ezra SIL;15.75
;   Main Window Position = 0,0,1280,809
;   Code Point Window Position = 880,52,374,831

; SIL Hebrew Standard Encoding      SILEzratoUni50.map      copyright SIL 2003
; with support for Standard or Display Encoding conversion to Unicode
; and return conversion from Unicode to a modified Standard Encoding
; (Note that displayable text direction must be reversed before running this conversion!)

; initial draft by Peter Kirk, December 2000
; modified to use newer TECkit features, Jonathan Kew, January 2001
; tweaked for current compiler, JK, Sept 2002
; modified, Joan Wardell, Jan-April 2003
; modified for NEW ORDER rearrangement Joan Wardell, July 2003-August 2003
; added Return (Unicode -> SE) Joan Wardell, September 2003 Files will not be perfect. An 
; exact return trip is not possible because the information needed is no longer there.
; assistance from Jonathan Kew Oct 2003 
; modified for added characters and recommendations in Unicode 4.1 and 5.0 cjs April 2006-March 2007

; Header fields identifying this mapping: encoding name, descriptive name,
; version, contact address and registration details.
EncodingName	"SIL-HEBREW_STANDARD-1997"					;jw
DescriptiveName	"SIL Hebrew Standard Encoding" 			;jw
Version	"58"								;jw
Contact	"mailto: sil_fonts@sil.org"
RegistrationAuthority	"SIL International"				;jw
RegistrationName "SILEzra" 

;LHSFlags (Consonant-Vowel-CantOrder)					;jw
;RHSFlags (Consonant-Vowel-CantOrder)					;jw

; None of the standard orders apply to biblical Hebrew. Visual order means
; the text has been reversed to appear correctly in applications that
; cannot handle right-to-left scripts. An example is Hebrew SE or DE
; displayed in Word (pre-Word 2000). Logical order means the text is
; in spoken order (approx), unreversed. Neither NFC nor NFD are appropriate
; choices for biblical Hebrew displayed with any Ezra SIL font.      ;jw

;Pass-count history: this program made 4 passes as of 8/25/03 (jw);
;it makes 6 passes as of March 2007 (cjs).

;Initial pass to remove thinspaces, found in DE texts.		jw 4/14/03
;Byte-to-byte pass with a single forward rule: byte 255 is matched and
;nothing is emitted, so it is dropped from the data.

pass (Byte)

255	> 	;thinspace deleted

;Pass to Split any plene vowels and do rearrangement  8/25/03
;Byte-to-byte pass. Each forward (>) rule below matches an optional run of
;marks followed by one combined plene byte and emits: the vowel byte, the
;matched marks (@a), then the letter byte (he 104, yod 121 or vav 119).
pass (Byte)

;attempting to list any mark possibly preceding a plene vowel (anything between consonant
; and plene, excluding dagesh, rafe, including any cant, masora, punctum, asterisk)
;NOTE(review): 126 appears twice in this list (second and seventh lines). The class is
; only matched and copied with @a in this pass, so the repeat looks redundant -
; confirm before removing.

ByteClass	[any_mark_de] = (043 066 084 \
			       089 090 094 .. 096 106 126 149 \
			       152 .. 159 161 \
			       178 179 181 180 189 .. 193 242 \
				245 247 249 250 253 \
				036 037 061 068 077 080 .. 082 \
				085 086 126 128 .. 144  \
				174 175 177 182 184 .. 186 188 )

;Vowel bytes (one source line per vowel); in this pass the class is used only
;by the rafe rule at the end.
ByteClass	[vow_de] = (  252    204    217    232    248    205    218    237  \
			      233    206    219    239  \
			      225    207    220    240  \
			      243    221    236    241  \
			      105    199    214    230  \
			      069    197    212    227  \
			      101    198    213    229  \
			      097    194    209    222  \
			      065    195    210    223    111    196    211    224  \
			      079    200    215  \
			      117    201    216    231  )

;Split plene vowels (full spelling SE only, but possible)
;([any_mark_de]){0,2}=a 244 > 079 @a 119  ;sample syntax, note curly brackets! 8/22/03
;NOTE that * will delete all but the first occ, unless you use @ on the output side. 8/22/03

;Vowel + he: 203, 228, 235, 246 become 069, 065, 101, 079 followed by he (104).
([any_mark_de]*)=a 203 > 069 @a 104
([any_mark_de]*)=a 228 > 065 @a 104
([any_mark_de]*)=a 235 > 101 @a 104
([any_mark_de]*)=a 246 > 079 @a 104

;Vowel + yod: 202, 234, 238 become 069, 101, 105 followed by yod (121).
([any_mark_de]*)=a 202 > 069 @a 121
([any_mark_de]*)=a 234 > 101 @a 121
([any_mark_de]*)=a 238 > 105 @a 121

;Split holem-vav to holem [any mark] vav
([any_mark_de]*)=a 244 > 079 @a 119

;Reverse (<) direction only: holem (079), up to two marks, then vav (119) are
;rewritten as the marks followed by holem-vav (244).
;NOTE(review): the forward rule above accepts any number of marks (*), while this
; reverse rule accepts at most two ({0,2}) - confirm the asymmetry is intended.
@a 244 < 079 ([any_mark_de]){0,2}=a 119 

;move rafe before any de vowel - no reversal as it should have been before vowel anyway
[vow_de]=a 038=b > 038=b [vow_de]=a

;Convert DE data to SE, in preparation for Unicode conversion jw 8/14/03
;Byte-to-byte pass. The class definitions below are used by the rules that
;follow, up to the next pass statement.
pass (Byte)

;Whitespace bytes; used as left context by the word-initial shureq rule in this pass.
ByteClass	[WS] 	= ( 009 010 013 032 176 )

;Consonant bytes; used as left context by the meteg/holem reordering rule in this pass.
;NOTE(review): 083 is listed twice (first and last lines). For a context-only class the
; repeat looks harmless - confirm before removing.
ByteClass	[cons]	= (   039    083    087    072    098    103    100    104    119    122    120 \
			      088    121    107    108    109    110    115    118 \
			      112    099    113    114    083    116	) ;added all sins 10/6/03

;deo= DE Only (excludes SE) vowels
;Each class below is matched as a whole and replaced by one SE vowel byte.
ByteClass	[patahs_deo] = (194 209 222)
ByteClass	[kamets_deo] = (195 210 223 196 111 211 224) ;also converts qamets_o, no longer avail
ByteClass	[hireqs_deo] = (199 214 230)
ByteClass	[segols_deo] = (198 213 229)
ByteClass	[tseres_deo] = (197 212 227)
ByteClass	[holems_deo] = (200 215)
ByteClass	[qibbs_deo] = (201 216 231)
ByteClass	[shewas_deo] = (204 217 232 248 205 218 237) ;also converts all silent shewa, no longer avail
ByteClass	[h_patahs_deo] = (207 220 240)
ByteClass	[h_kamets_deo] = (236 221 241)
ByteClass	[h_segols_deo] = (206 219 239)

;NOTE(review): no active rule between this definition and the next pass statement
; references [vow_de] - confirm whether it is still needed here.
ByteClass	[vow_de] = (  252    204    217    232    248    205    218    237  \
			      233    206    219    239  \
			      225    207    220    240  \
			      243    221    236    241  \
			      105    199    214    230  \
			      069    197    212    227  \
			      101    198    213    229  \
			      097    194    209    222  \
			      065    195    210    223    111    196    211    224  \
			      079    200    215  \
			      117    201    216    231  )

;[meteg] and [holem] are matched and re-emitted by the reordering rule in this pass.
;NOTE(review): [meteg_se], [holem_se] and [munah] are not referenced by any active rule
; in this pass ([munah] appears only in a commented-out rule) - confirm before removing.
ByteClass	[meteg]	= ( 149 189 .. 193 )
ByteClass	[meteg_se]	= ( 149 149 149 149 149 149 )
ByteClass	[holem] 	= (079 200 215)
ByteClass	[holem_se] 	= (079 079 079)
ByteClass	[munah] 	= (158 90 249)
;Dagesh variants plus rafe (038); optional context in the reordering rule.
ByteClass	[dr_de]	= ( 067    070    071    073    074    075    076    183    208  \
			      038 )
;Optional context before a word-initial meteg + shureq in this pass.
ByteClass	[pp_de] = ( 077 132 138 152 155 )
ByteClass	[pp_de] = ( 077 132 138 152 155 )

;plene SE:                    ;plene parts SE:
;	244 holem-vav          ;	he = 104
;	251 shureq             ;	yod = 121
 
;	202 tsere-yod          ;	vav = 119
;	234 segol-yod
;	238 hiriq-yod          ;	hiriq = 105
                               ;	tsere = 69
;	203 tsere-he           ;	qamats = 65
;	228 qamats-he          ;	segol = 101
;	235 segol-he           ;	holem = 079
;	246 holem-he

;Split hataf-metegs NO! need to preserve these for later special treatment cjs April 2006
;254	> 225 149 
;102	> 233 149
;but the reverse is needed in the back conversion cjs May 2006
;NO! we have to sacrifice the distinction between medial and left in the back conversion to get pre-positive cants right
;225 149 < 254
;233 149 < 102

;Move DE data to SE
;Every rule in this group is forward-only (>) and rewrites one DE byte to one SE byte.
;Dagesh
;All eight DE dagesh variants collapse to the single byte 208.
183 > 208
67 > 208
70 > 208
71 > 208
73 > 208
74 > 208
75 > 208
76 > 208
;Accent
180 > 181
;Asterisk
43 > 42 ;(high asterisk no longer available, converting to regular asterisk)

;High Cants
;Targets run from 128 to 144; no rule in this group produces 143.
174 > 128
175 > 129
177 > 130
182 > 131
184 > 132
185 > 133
186 > 134
188 > 135
68 > 136
61 > 137
77 > 138
85 > 139
80 > 140
81 > 141
82 > 142
86 > 144

;Low Cants
;Forward-only (>) rules rewriting each DE low-mark byte to its SE byte.
;Each source byte appears in exactly one rule.
;DE meteg variants 189-193 all collapse to the single meteg byte 149.
189 > 149
190 > 149
191 > 149
192 > 149
193 > 149
;Remaining low marks; several targets have two DE source bytes.
96 > 152
95 > 153
106 > 154
89 > 155
247 > 155
66 > 156
245 > 156
84 > 157
90 > 158
249 > 158
94 > 159
250 > 159
242 > 161
253 > 161

;Convert DE Vowels to SE
;Each rule matches any member of a DE-only class and emits one fixed SE vowel byte.
[patahs_deo] > 097
[kamets_deo] > 065
[hireqs_deo] > 105
[segols_deo] > 101
[tseres_deo] > 069
[holems_deo] > 079
[qibbs_deo] > 117
[shewas_deo] > 252
[h_patahs_deo] > 225
[h_kamets_deo] > 243
[h_segols_deo] > 233

;Convert shureq to vav+dagesh
251 > 119 208
;rearrange pre-positive metheg on shureq to follow - word-initial only - only needed for de text cjs May 2006
;de text sometimes has both pre-pos cant and metheg before word-initial shureq
;Context: whitespace or start of text, then an optional [pp_de] byte, immediately before
;the matched 149 251. Output places the meteg (149) after vav+dagesh.
149 251 / ([WS] | #) [pp_de]? _ > 119 208 149

;Split DE Finals combinations into intermediate state for now
;Each combined DE byte becomes a non-final consonant byte (107, 110 or 112)
;followed by its point byte(s): 65, 208 and/or 252.
167 > 107 65
168 > 110 65
169 > 112 208
170 > 107 252
172 > 107 208
173 > 107 208 65

;108 [meteg] [holem] [munah] > 108 079 149 158  ;rearrange lamed-meteg-holem to lamed-holem-meteg.
; need to find out where holem is in relation to right meteg. L-met-O-munah 1 occ, O95-11 occs which are 
; encoded as meteg-holem in Ezra SIL SE/DE. 
; No need to use PUA right meteg, just get order correct: meteg after O-holem. 8/14/03
;TEST REMOVAL. Keep. No, try class rule below to get all occs. 8/25/03
;108 191 215 90 > 108 079 149 158  ;this is 1 occ. Deut 5:8:1, LO9574 jw 4/1/03
;Swap meteg and holem when they directly follow a consonant (with an optional
;dagesh/rafe byte in between), so that the holem comes first.
;NOTE(review): both sides use the same classes, so the matched bytes appear to be
; re-emitted unchanged (a DE meteg or holem byte would stay DE here) - confirm that
; the following pass is expected to handle those DE bytes.
[meteg]=a [holem]=b / [cons] [dr_de]? _ > [holem]=b [meteg]=a
;Not possible to do return trip to 95, there is no context and there are 196 occs of meteg(75) with holem.
;These look identical in Unicode. This conversion table makes it so you cannot have a right meteg on holem. 
;That doesn't make sense anyway.

;RETURN TRIP

; 'Chapter'  < 'Ðhapter'
;compiler rejects quoted strings so change to
;Reverse (<) direction only: the byte sequence 208 104 97 112 116 101 114 is
;rewritten with 67 in place of the leading 208.
67 104 97 112 116 101 114 < 208 104 97 112 116 101 114	;added to make data processed through our cc tables a bit cleaner.


;*********************************************************************************************************************
;Pass to convert from byte (as in, ASCII & upper ASCII Ezra SIL encoding) data to Unicode. 
;Note that this pass originally was written
;to handle either DE or SE data. At present, no DE data should be passing through here
;but I'm not certain. 8/25/03 jw

pass (Byte_Unicode)								;jw

ByteDefault		063				; question mark
UniDefault		replacement_character	;FFFD

; in the mapping rules
;	<>	means a bidirectional rule
;	>	means only byte->Unicode
;	<	means only Unicode->byte

; General order of encoding in SIL encoding {parens() = optional}:
; consonant (dagesh/rafe)(vowel point)(meteg|cantillation).
; Exceptions: Right meteg comes before vowel point.
; Prepositive accents come before word-initial consonant.
; In the few cases where meteg and cantillation co-occur, the order is not fixed.
; Cantillation can be either high-low or low-high, or mixed. It is not fixed.

; CLASS DEFINITIONS

; there are separate namespaces for Byte and Unicode classes,
; allowing us to use the same name for classes with corresponding content

; control characters
ByteClass	[CTL]	= (   0x00 .. 0x1f     0x7f )
UniClass	[CTL]	= ( U+0000 .. U+001f U+007f )

; ASCII characters, excluding space
ByteClass	[ascii]	= (   0x21 ..   0x7e )
UniClass	[ascii]	= ( U+0021 .. U+007e )

; alphanumeric ASCII characters, legal in SF markers
ByteClass	[anum]	= (   0x30 ..   0x39   0x41 ..   0x5A   0x61 ..   0x7A )
UniClass	[anum]	= ( U+0030 .. U+0039 U+0041 .. U+005A U+0061 .. U+007A )

; numeric ASCII characters, found in verse numbers  jw 3/13/03 
ByteClass	[num]	= (   048 ..   057 )
UniClass	[num]	= ( U+0030 .. U+0039 )  ;uncommented 7/24/03 jw

; whitespace (excluding 255 thin space which is used for spacing within words)
ByteClass	[WS] 	= ( 009 010 013 032 176 ) ; deleted 160, can't remember what it is! 7/24/03 jw
UniClass	[WS] 	= ( U+0009 U+000A U+000D U+0020 U+00A0 ) ;7/24/03 jw

; whitespace plus word dividing punctuation (including maqqef) and stream start/end 
;8/22/03 changed name from WSP to OOB for OUT OF BOUNDS. This more clearly reflects what is contained here - 
;any mark which is not Hebrew, or would legitimately force a non-final to become final, such as numbers. This
;is also used to identify pre-positive marks. 8/25/03
;In the rules visible in this file [OOB] appears only as rule context, never as
;the matched or emitted item. The byte and Unicode lists have different member
;counts, so they are not position-paired.
ByteClass	[OOB]	= ( 009 010 013 032 .. 035 040 041 042 044 .. 047 058 059 060 062 063 \
			    091 092 093 123 124 125 150 151 160 176 145 .. 148 171 187 048 ..057);jw added quotes, numbers, dashes, setuma/petuha
;copied here from last pass 9/12/03 jw,used for RETURN context, may not match above
;NOTE(review): U+0020 is listed explicitly and is also the first member of the
; range U+0020 .. U+0040 - the explicit entry looks redundant; confirm before removing.
UniClass	[OOB] 	= ( U+0009 U+000A U+000D U+0020 U+00A0 U+0020 .. U+0040 U+05F3 U+05F4 \
				U+05BE U+05C0 U+05C3 U+005B .. U+0060 \
				U+007B .. U+007E U+00A7 U+00AB U+00AF \
				U+00B6 U+00BB U+00BF U+2000 .. U+200A U+2010 .. U+2021 \
				U+2039 U+203A )
	
; all Hebrew letters etc in Unicode
;UniClass	[heb]	= ( U+0591 .. U+05F4 U+FB1D .. U+FB4F )

; consonants (in Hebrew alphabetical order including unpointed shin, no non-final or final forms, jw; excl sin, shin w/dots, non-finals)
;ByteClass	[cons]	= (   039    098    103    100    104    119    122    120 \
;			      088    121    107    108    109    110    115    118 \
;			      112    099    113    114    083    116	) 
UniClass	[cons]	= ( U+05D0 U+05D1 U+05D2 U+05D3 U+05D4 U+05D5 U+05D6 U+05D7 \
			    U+05D8 U+05D9 U+05DB U+05DC U+05DE U+05E0 U+05E1 U+05E2 \
			    U+05E4 U+05E6 U+05E7 U+05E8 U+05E9 U+05EA )

; non-final forms of consonants with final forms
ByteClass	[nonf]	= (   107    109    110    112    099  )
UniClass	[nonf]	= ( U+05DB U+05DE U+05E0 U+05E4 U+05E6 )

; final forms of consonants
UniClass	[final]	= ( U+05DA U+05DD U+05DF U+05E3 U+05E5 )

; all consonants, including shin/sins, nonfinal, Display Encoding final forms, plene forms of vowels. Should be used in any context searching for consonants or forms that take space on the baseline. Do not use for conversion. ;jw  4/28/03 9/18/03 deleted line 5 duplicates
ByteClass	[allcons] = (039    098    103    100    104    119    122    120 \
			      088    121    107    108    109    110    115    118 \
			      112    099    113    114    083    116	\
			      072    087 \
			      162 163 164 165 166 167 168 169 170 172 173 \
			      244 251 \
			      202 203 228 234 235 238 246 )

; meteg - all DE varieties
ByteClass	[meteg]	= ( 149 189 .. 193 )

; all points
ByteClass	[point]	= ( 036 .. 038 043 061 065 .. 071 073 .. 077 079 .. 082 084 .. 086 \
			    089 090 094 .. 097 101 102 105 106 111 117 126 128 .. 144 149 \
			    152 .. 159 161 174 175 177 .. 186 188 .. 201 204 .. 225 \
			    227 229 .. 233 236 237 239 .. 243 245 247 .. 250 252 .. 254 ) ;jw
;added punctum 178-179

; prepositive accents in SE and their Unicode equivalents
ByteClass	[pp_se]	= (  155    138    152    132  );jw chg 77 to 138 geresh SE
UniClass	[pp_se]	= ( U+05AD U+059D U+059A U+05A0 )

; prepositive accents in DE and their Unicode equivalents
;ByteClass	[pp_de]	= (   155    089    247    077    138    152    096    132    184  )
;UniClass	[pp_de]	= ( U+05AD U+05AD U+05AD U+059D U+059D U+059A U+059A U+05A0 U+05A0 )
; not sure if it is permitted to have duplicate class members
; but it is useful here in a class which appears only as output of rule
; indeed the following table depends on this, would be much more complicated otherwise

; vowel points in SE and their Unicode equivalents
ByteClass	[vow_se]	= ( 252 233 225 243 105 069 101 097 065 079 117 )
UniClass	[vow_se]	= ( U+05B0 .. U+05B9 U+05BB )

; vowel points in DE and their Unicode equivalents (one line per vowel)
; note that SE silent shewa and qamets o are treated as DE only here
ByteClass	[vow_de] = (   252    204    217    232    248    205    218    237  \
			      233    206    219    239  \
			      225    207    220    240  \
			      243    221    236    241  \
			      105    199    214    230  \
			      069    197    212    227  \
			      101    198    213    229  \
			      097    194    209    222  \
			      065    195    210    223    111    196    211    224  \
			      079    200    215  \
			      117    201    216    231  )
UniClass	[vow_de] = ( U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 \
			    U+05B1 U+05B1 U+05B1 U+05B1 \
			    U+05B2 U+05B2 U+05B2 U+05B2 \
			    U+05B3 U+05B3 U+05B3 U+05B3 \
			    U+05B4 U+05B4 U+05B4 U+05B4 \
			    U+05B5 U+05B5 U+05B5 U+05B5 \
			    U+05B6 U+05B6 U+05B6 U+05B6 \
			    U+05B7 U+05B7 U+05B7 U+05B7 \
			    U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 \
			    U+05B9 U+05B9 U+05B9 \
			    U+05BB U+05BB U+05BB U+05BB )

; dagesh and rafe in SE and their Unicode equivalents
ByteClass	[dr_se]	= (   208    038  )
UniClass	[dr_se]	= ( U+05BC U+05BF )

; dagesh and rafe in DE and their Unicode equivalents
ByteClass	[dr_de]	= (   067    070    071    073    074    075    076    183    208  \
			      038 )
UniClass	[dr_de]	= ( U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC \
			    U+05BF )

; shin and sin
ByteClass	[shs]	= (   072    087	)

; shin and sin dots
UniClass [shindots] = ( U+05C1 U+05C2 )

;start storage by jw

; low vowels CCAT
; A F I E " U : :a :f :e
;ByteClass	[low_vow_de] = (252    204    217    232    248    205    218    237  \
;			      233    206    219    239  \
;			      225    207    220    240  \
;			      243    221    236    241  \
;			      105    199    214    230  \
;			      069    197    212    227  \
;			      101    198    213    229  \
;			      097    194    209    222  \
;			      065    195    210    223    111    196    211    224  \
;			      117    201    216    231  )
;UniClass	[low_vow_de] = (U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 \
;			    U+05B1 U+05B1 U+05B1 U+05B1 \
;			    U+05B2 U+05B2 U+05B2 U+05B2 \
;			    U+05B3 U+05B3 U+05B3 U+05B3 \
;			    U+05B4 U+05B4 U+05B4 U+05B4 \
;			    U+05B5 U+05B5 U+05B5 U+05B5 \
;			    U+05B6 U+05B6 U+05B6 U+05B6 \
;			    U+05B7 U+05B7 U+05B7 U+05B7 \
;			    U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 \
;			    U+05BB U+05BB U+05BB U+05BB )

; low cants CCAT
; 35 70 71 72 73 74 75 91 92 93 94 95

;Byte and Unicode lists below have 12 members each and line up by position:
;149 with U+05BD, 152 .. 159 and 161 with the nine accents on the second Unicode
;line, and the two trailing 179 entries with U+05C5 and U+0323.
;NOTE(review): 179 is repeated, presumably so that both U+05C5 and the older
; U+0323 pair with the same byte - confirm before removing the repeat.
ByteClass	[low_p_se] = (149 \
			    152 .. 159 161 \
			    179 179 ) 

;changed U+0323 to U+05C5 lower dot cjs April 2006
UniClass	[low_p_se] = (U+05BD \
				U+05A4 U+059B U+05AA U+0596 U+05A5 U+05A6 U+05A3 U+05A7 U+0591 \
				 U+05C5 U+0323 )


;ByteClass	[high_vow_de] = (079 200 215)
;UniClass	[high_vow_de] = (U+05B9 U+05B9 U+05B9)

; high cants CCAT
; 24 33 44 60 61 62 63 64 65 80 81 82 83 84 85

ByteClass	[high_p_se] = (036 037  \
				126 128 .. 144  \
				178 )

;RE--ADDING THESE TWO GROUPS FOR HOLEM VAV RETURN PROCESSING!  9/12/03 jw
;Byte and Unicode lists below have 33 members each and line up by position:
;036 037 with U+0307 U+0308; 126 and 128 .. 144 with the eighteen code points on
;the next two Unicode lines; 149 with U+05BD; 152 .. 159 and 161 with the nine
;accents that follow; 178 with U+05C4; the two 179 entries with U+05C5 and U+0323.
;NOTE(review): 179 is repeated, presumably so that both U+05C5 and the older
; U+0323 pair with the same byte - confirm before removing the repeat.
ByteClass	[any_p_se] = (	036 037  \
				126 128 .. 144  \
				149 \
				152 .. 159 161 \
			    	178 179 179 )

;changed U+0323 to U+05C5 lower dot
UniClass	[any_p_se] = (	U+0307 U+0308  \
				U+05AF U+0597 U+0594 U+0592 U+05A9 U+05A0 U+059F U+05AB \
				U+05A1 U+0595 U+05A8 U+059C U+059E U+05AC U+0598 U+0593 U+0599 U+05AE \
				U+05BD \
				U+05A4 U+059B U+05AA U+0596 U+05A5 U+05A6 U+05A3 U+05A7 U+0591 \
				U+05C4 U+05C5 U+0323 )

ByteClass	[holem] = (079 200 215)	;jw 4/17/03

ByteClass	[munah] = (158 90 249)

Define		ZWNJ U+200C	;jw 8/4/03
Define		ZWJ  U+200D
Define		CGJ  U+034F

ByteClass	[hiriq] = (105 199 214 230)	;jw 12/10/03

; START OF ACTUAL CONVERSIONS

; control characters
[CTL]	<> [CTL] 

; preserve standard format markers of backslash followed by string of alphanumeric ASCII characters
;'\' / _ [anum]		<> reverse_solidus / _ [anum]
;[anum] / '\' [anum]* _	<> [anum] / reverse_solidus [anum]* _

;compiler rejects quoted strings so change to
92 / _ [anum]		<> reverse_solidus / _ [anum]
[anum] / 92 [anum]* _	<> [anum] / reverse_solidus [anum]* _


; Cantillation marks

161 	<> U+0591	; CCAT 92	Hebrew accent ETNAHTA
242 	 > U+0591	;			ditto DE
253 	 > U+0591	;			ditto DE
130	<> U+0592	; CCAT 01	Hebrew accent (SEGOL) SEGOLTA (postpositive)
177	 > U+0592	;			ditto DE
142	<> U+0593	; CCAT 65	Hebrew accent SHALSHELET
082	 > U+0593	;			ditto DE
129	<> U+0594	; CCAT 80	Hebrew accent ZAQEF QATAN
175	 > U+0594	;			ditto DE
136	<> U+0595	; CCAT 85	Hebrew accent ZAQEF GADOL
068	 > U+0595	;			ditto DE
155 	<> U+0596	; CCAT 73	Hebrew accent TIPEHA = tarha   
089	 > U+0596	;			ditto DE
247	 > U+0596	;			ditto DE
; Same SIL codes as DEHI, TIPEHA is not word initial
; In word initial environment 155, 089, 247 > DEHI, see below
128	<> U+0597	; CCAT 81	Hebrew accent REVIA
174	 > U+0597	;			ditto DE
141	<> U+0598	; CCAT 82	Hebrew accent ZARQA = zinorit
081	 > U+0598	;			ditto DE
143	<> U+0599	; CCAT 03,33	Hebrew accent PASHTA (postpositive) (left)

; CCAT 03 is word final; CCAT 33 is not word final but to left of letter ***
; except when followed by holem, when it centers. jw 3/12/03
;[holem] 143 / _ ^039 > U+05B9 U+05A8 ; any holem with 33, not followed by aleph (039) goes from left to medial

;108 [dr_de]?=c [vow_de]?=b  143 / _ [allcons] > U+05DC [dr_de]?=c [vow_de]?=b U+05A8 ; lamed with 33, not word-final  12/10/03 changed order of dagesh, both sides jw

;108 [dr_de]?=c [vow_de]?=b  143 / _ ([allcons] | [hiriq]) > U+05DC [dr_de]?=c [vow_de]?=b U+05A8 
; lamed with 33, not word-final  12/10/03 changed order of dagesh, both sides, corrected context
; of / _ [allcons]. This rule needs to also include all forms of LAIM, without picking up wordfinal 03. jw

; this special code for pashta/azla is not needed becasue it is now handled in the font, so it is commented out cjs March 2007
; trying to get pashta/azla to match better on return... 
; For this pair, I will just convert back to shape, not original encoding. Getting back to the original isn't possible,
; without determining every possible context, and there may not be a general context anyway. If I send them back
; to the correct shape, at least the text should look correct, even though the encoding is not identical.  9/22/03 jw

;Change to 05A8 when sin dot collision forces to medial version 05A8 jw 4/28/03, changed order of dagesh 10/6/03 jw
;087  [dr_de]?=c [vow_de]?=b  143 / _ [allcons] > U+05E9 U+05C2 [dr_de]?=c [vow_de]?=b  U+05A8 ;sin w/dot and 33, also not word-final jw

;152	<> U+059A	; CCAT 10	Hebrew accent YETIV (prepositive);uncommented 8/21/03 recom 8/22/03
; Same SIL code as MAHAPAKH, YETIV is word initial
; Converted by special prepositive conversion code below
153	<> U+059B	; CCAT 91	Hebrew accent TEVIR
095	 > U+059B	;			ditto DE
138	<> U+059C	; CCAT 61	Hebrew accent GERESH
077	 > U+059C	;			ditto DE
; Same SIL codes as GERESH MUQDAM, GERESH is not word initial
; In word initial environment 138, 077 > GERESH MUQDAM, see below
;138	<> U+059D	; CCAT 11	Hebrew accent GERESH MUQDAM (prepositive)uncom 8/21/03 recom 8/22/03
;077	 > U+059D	;			ditto DE uncom 8/21/03
; Same SIL codes as GERESH, GERESH MUQDAM is word initial
; Converted by special prepositive conversion code below
139	<> U+059E	; CCAT 62	Hebrew accent GERSHAYIM
085	 > U+059E	;			ditto DE
133	<> U+059F	; CCAT 84	Hebrew accent QARNEY PARA
185	 > U+059F	;			ditto DE
132	<> U+05A0	; CCAT 14,44	Hebrew accent TELISHA GEDOLA (prepositive) uncom 8/21/03 recom 8/22/03
;184	 > U+05A0	;			ditto DE 				for consistency uncom 9/18/03 need this for Gen 5:29:5 zeh with medial prepos. jw   Okay commented 10/6/03
; CCAT 14 is usually word initial; CCAT 44 is not word initial, occurs Gen 5:29:5,
; Ezra 5:17, Esth 6:13 jw 3/18/03
; ***  See also gen 7:7:2nd line
; Converted by special prepositive conversion code below
135	<> U+05A1	; CCAT 83	Hebrew accent PAZER
188	 > U+05A1	;			ditto DE
158 	<> U+05A3	; CCAT 74	Hebrew accent MUNAH
090	 > U+05A3	;			ditto DE
249 	 > U+05A3	;			ditto DE
152	/ [OOB] _ 	> U+059A  ;8/22/03 This code determines which YETIB to use. Keep. 
152	/ # _ 		> U+059A  ;8/22/03 no DE should be coming through at this point.
152	<> U+05A4	; CCAT 70	Hebrew accent MAHAPAKH medial version
096	 > U+05A4	;			ditto DE medial version
; Same SIL codes as YETIV, MAHAPAKH is not word initial
; In word initial environment 152, 096 > YETIV, see above
156 	<> U+05A5	; CCAT 71	Hebrew accent MERKHA = yored
066	 > U+05A5	;			ditto DE
245	 > U+05A5	;			ditto DE
157	<> U+05A6	; CCAT 72	Hebrew accent MERKHA KEFULA
084	 > U+05A6	;			ditto DE
159 	<> U+05A7	; CCAT 94	Hebrew accent DARGA
094	 > U+05A7	;			ditto DE
250	 > U+05A7	;			ditto DE
137	<> U+05A8	; CCAT 63	Hebrew accent QADMA = azla medial
061	 > U+05A8	;			ditto DE

; CCAT 04 is word final; CCAT 24 (rare) is not word final but to left of letter ***
; Forward: byte 131 directly before an [OOB] item becomes U+05A9 alone; elsewhere
; it becomes U+05A9 followed by ZWNJ. Reverse: U+05A9, with or without a
; following ZWNJ, returns to SE byte 131.
131 /    _ [OOB] <> U+05A9       ; CCAT 04	Hebrew accent TELISHA QETANA (postpositive)
131              <> U+05A9 ZWNJ  ; CCAT 24	Hebrew accent TELISHA QETANA (non-postpositive)
131 / 79 _ 39    <> U+05A9       ;exception for Est 6.13.11 to allow holem to move on to the aleph

; DE byte 182 is converted forward only (>), like the other "ditto DE" rules in
; this pass, so that U+05A9 has a single reverse target (SE byte 131).
182 /    _ [OOB]  > U+05A9       ; ditto DE
182               > U+05A9 ZWNJ  ; ditto DE (non-postpositive)


154	<> U+05AA	; CCAT 93	Hebrew accent YERAH BEN YOMO = galgal
106	 > U+05AA	;			ditto DE
; new character 05A2 equivalent to 05AA, not used in BHS 
154	<  U+05A2
134	<> U+05AB	; CCAT 60	Hebrew accent OLE
186	 > U+05AB	;			ditto DE
140	<> U+05AC	; CCAT 64	Hebrew accent ILUY
080	 > U+05AC	;			ditto DE
;155	<> U+05AD	; CCAT 13	Hebrew accent DEHI (prepositive) uncom 8/21/03 recom 8/22/03
; Same SIL code as TIPEHA, DEHI is word initial
; Converted by special prepositive conversion code below
144	<> U+05AE	; CCAT 02	Hebrew accent ZINOR (postpositive)
086	 > U+05AE	;			ditto DE
126	<> U+05AF	; 	 	Hebrew mark MASORA CIRCLE

; 7/30/03
; TEST rearrangement back for mixed high/low marks. 
; worked above Exod20:3:2, not below or above for 20:2:3 7/31/03 
; these 2 examples are in conflict. One needs to be converted, but not the other.
; Both are LOLOHI in Unicode at present.  Note this is Reverse direction.
;[vow_se]=a  [high_p_se]=c [low_p_se]=b < [vow_se]=a [low_p_se]=b [high_p_se]=c
;[vow_se]=a  [low_p_se]=b [high_p_se]=c < [vow_se]=a [high_p_se]=c [low_p_se]=b


;******************************************************************************************************
;WORKING AREA  8/14/03


; holemvav with preceding cants OK 8/14/03
; divine name OK as is 8/15/03
; legal second vowels: hiriq,sheva,patah-only with LAIM OK
; metegs 
; metegs precedence over standard problem OK
; paired cants which change encoding 33-03 OK, not sure about others.



;******************************************************************************************************
;Finals processing and rearrangement - 3 possible environments 8/12/03
;Ezra standard encoding contains no finals.
;Unicode final form always mapped to Ezra standard form, final forms are DE only
;[nonf] / _ [point]* [WSP] > [final]
;[nonf] / _ [point]* #	> [final]
;[nonf] / _ [point]* [num]+ > [final]   	;jw 3/13/03 OOB now catches finals preceding verse numbers with 
;fourth possibility cjs April 2006
;[nonf] / [WSP] _ [point]* [WSP] > [nonf]
; no whitespace. See Exod 20:4:final, Exod 20:9:final.

;This is WORD-FINAL environment. (This should not hit if nonf followed by 226-ZWL, which is what we want for preventing finals processing.)
;no return needed. Finals are not part of SE encoding. 9/11/03
;in decalogue final kaph is followed by two high cants so change to {0,2} cjs April 2006
;Forward: a [nonf] byte becomes the same-position member of Unicode [final] when it is
;followed by up to two dagesh/rafe bytes, an optional vowel, an optional low mark, up to
;two high marks, an optional low mark, and then an [OOB] item or the end of the text.
;The Unicode side has no context, so in reverse any [final] maps to its [nonf] byte.
[nonf]=b / _  [dr_se]{0,2} [vow_se]? [low_p_se]? [high_p_se]{0,2} [low_p_se]? ( [OOB] | # )  <>  [final]=b 

;to protect the free form of preposition kaph, added by cjs April 2006, and also pethuhah, modified by cjs Feb 2007
;Same trailing context as the rule above plus a preceding [OOB] item or start of text:
;a letter standing alone as a word keeps its non-final Unicode form.
;NOTE(review): this relies on the longer context taking priority over the rule above -
; confirm against the TECkit rule-ordering documentation.
[nonf]=b  / ( [OOB] | # ) _ [dr_se]{0,2} [vow_se]? [low_p_se]? [high_p_se]{0,2} [low_p_se]? ( [OOB] | # ) <>  [nonf]=b 


;This is WORD-INITIAL environment. 
;is return needed? yes 9/12/03
;Both directions: a [pp_se] item directly after an [OOB] item, or at the start of the
;text, maps to the same-position member of the other side's [pp_se] class.
 [pp_se]=a / [OOB] _ <> [pp_se]=a / [OOB] _ 	 ;8/25/03, 9/11/03 jw
 [pp_se]=a / # _     <> [pp_se]=a / # _		 ;8/25/03, 9/11/03 jw

;NOTE pp_se MUST be used with WhiteSpace/OOB environments. Otherwise, you will have marks identified
;as word-initial which are not. We use the same mark in SE for 2 items in Unicode. Example: 05AD 0596

;This is also very tricky. In the case where you might have a hilo or lohi cant combination, you want to catch both. Two lines of code won't work, because they would be the same length and ? matches zero. Therefore, both lohi and hilo would activate the original command below. NEXT LINE COMMENTED on purpose.
;[pp_se]=a? [cons]=b [dr_se]=c? [vow_se]=f? [low_p_se]=g? [high_p_se]=h? >  [cons]=b [dr_se]=c [vow_se]=f [low_p_se]=g [pp_se]=a [high_p_se]=h
;Basically, in a hilo situation, this command would assume the first lo was optional, run the command, and drop out for the final lo mark. To get around this, I've combined the two commands by saying optional lohilo. This seems to work. See
;nonf processing above and elsewhere.


;******************************************************************************************************
; Points and punctuation

252	<> U+05B0	; CCAT :	Hebrew point SHEVA
204	 > U+05B0	;			ditto DE
217	 > U+05B0	;			ditto DE
232	 > U+05B0	;			ditto DE
233	<> U+05B1	; CCAT :E	Hebrew point HATAF SEGOL
206	 > U+05B1	;			ditto DE
219	 > U+05B1	;			ditto DE
239	 > U+05B1	;			ditto DE
225	<> U+05B2	; CCAT :A	Hebrew point HATAF PATAH
207	 > U+05B2	;			ditto DE
220	 > U+05B2	;			ditto DE
240	 > U+05B2	;			ditto DE
243	<> U+05B3	; CCAT :F	Hebrew point HATAF QAMATS
221	 > U+05B3	;			ditto DE
236	 > U+05B3	;			ditto DE
241	 > U+05B3	;			ditto DE
105	<> U+05B4	; CCAT I	Hebrew point HIRIQ
199	 > U+05B4	;			ditto DE
214	 > U+05B4	;			ditto DE
230	 > U+05B4	;			ditto DE
069	<> U+05B5	; CCAT "	Hebrew point TSERE
197	 > U+05B5	;			ditto DE
212	 > U+05B5	;			ditto DE
227	 > U+05B5	;			ditto DE
101	<> U+05B6	; CCAT E	Hebrew point SEGOL
198	 > U+05B6	;			ditto DE
213	 > U+05B6	;			ditto DE
229	 > U+05B6	;			ditto DE
097	<> U+05B7	; CCAT A	Hebrew point PATAH
194	 > U+05B7	;			ditto DE
209	 > U+05B7	;			ditto DE
222	 > U+05B7	;			ditto DE
065	<> U+05B8	; CCAT F	Hebrew point QAMATS
195	 > U+05B8	;			ditto DE
210	 > U+05B8	;			ditto DE
223	 > U+05B8	;			ditto DE
; new character 05C7 equivalent to 05B8, not used in BHS
065	<  U+05C7
079	<> U+05B9	; CCAT O	Hebrew point HOLEM
200	 > U+05B9	;			ditto DE
215	 > U+05B9	;			ditto DE
117	<> U+05BB	; CCAT U	Hebrew point QUBUTS
201	 > U+05BB	;			ditto DE
216	 > U+05BB	;			ditto DE
231	 > U+05BB	;			ditto DE
208	<> U+05BC	; CCAT .	Hebrew point DAGESH or MAPIQ = shuruq
067	 > U+05BC	;			ditto DE
070	 > U+05BC	;			ditto DE
071	 > U+05BC	;			ditto DE
073	 > U+05BC	;			ditto DE
074	 > U+05BC	;			ditto DE
075	 > U+05BC	;			ditto DE
076	 > U+05BC	;			ditto DE
183	 > U+05BC	;			ditto DE
149	<> U+05BD	; CCAT 35 (with hataf vowels)  
 			; CCAT 75 (default)
 			; CCAT 95 (word-initial) ;jw modified note from "prepositive"
 			;		Hebrew point METEG = siluq
189	 > U+05BD	;			ditto DE
190	 > U+05BD	;			ditto DE
191	 > U+05BD	;			ditto DE
192	 > U+05BD	;			ditto DE
193	 > U+05BD	;			ditto DE


; special processing BACK of word-initial meteg, similar to that of prepositive accents above 
;jw modified meteg notes. Meteg is never word-initial and does not even precede holem, (exc. in 11 occs of 95-O)
; MC encoding for regular meteg is 75.
; MC encoding for right meteg is 95. Right meteg is accomplished by placing right meteg before the low vowel. Left meteg on
; hatafs must be CGJ followed by regular meteg. 7/24/03 jw 
;Left meteg was not encodable after hatafs in SE. It is placed on left 
;but will be converted to central (regular) meteg if converted to DE jw 8/5/03
149 < CGJ U+05BD				; jw 8/12/03 LEFT METEG NEW ORDER
149 < U+F303					; jw 8/5/03 old PUA LEFT METEG
149 < U+F302					; jw 10/6/03 old PUA RIGHT METEG

045	<> U+05BE	; CCAT -	Hebrew punctuation MAQAF
038	<> U+05BF	; CCAT ,	Hebrew point RAFE
124	<> U+00A0 U+05C0	; CCAT 05	Hebrew punctuation PASEQ = legarmeh jw added NBSP for better spacing 3/31/03
;	<> U+05C1	;		Hebrew point SHIN DOT
;	<> U+05C2	;		Hebrew point SIN DOT
058	<> U+05C3	; CCAT 00	Hebrew punctuation SOF PASUQ
178	<> U+05C4	; CCAT 52	Hebrew mark UPPER DOT 05C4(changed to 0307 8/15/03, changed back 9/9/03)jw
178	<  U+0307	; CCAT 52	Hebrew mark UPPER DOT 05C4(changed to 0307 8/15/03, changed back 9/9/03)jw

; Based on ISO 8859-8
;
; Hebrew consonants U+05D0..U+05EA.
; Ordinary letters round-trip one byte <-> one code point. The five final
; forms are asymmetric: bytes 162-166 are forward-only ( > ), while on the
; return trip ( < ) a final letter comes back as the byte of the ordinary
; letter (107, 109, 110, 112, 099).
; SAMEKH (115) and PE (112) are commented out here ("moved 7/23/03"); their
; live rules sit after the setuma/petuha rules further down in this pass.

039	<> U+05D0	; CCAT )	Hebrew letter ALEF = aleph
098	<> U+05D1	; CCAT B	Hebrew letter BET
103	<> U+05D2	; CCAT G	Hebrew letter GIMEL
100	<> U+05D3	; CCAT D	Hebrew letter DALET
104	<> U+05D4	; CCAT H	Hebrew letter HE
119	<> U+05D5	; CCAT W	Hebrew letter VAV
122	<> U+05D6	; CCAT Z	Hebrew letter ZAYIN
120	<> U+05D7	; CCAT X	Hebrew letter HET
088	<> U+05D8	; CCAT +	Hebrew letter TET
121	<> U+05D9	; CCAT Y	Hebrew letter YOD
162	 > U+05DA	; CCAT K	Hebrew letter FINAL KAF 
107	<  U+05DA	; CCAT K	Hebrew letter FINAL KAF  7/23/03 jw
; see code below for processing of final forms
107	<> U+05DB	; CCAT K	Hebrew letter KAF
108	<> U+05DC	; CCAT L	Hebrew letter LAMED
163	 > U+05DD	; CCAT M	Hebrew letter FINAL MEM
109	<  U+05DD	; CCAT M	Hebrew letter FINAL MEM 7/23/03 jw
; see code below for processing of final forms
109	<> U+05DE	; CCAT M	Hebrew letter MEM
164	 > U+05DF	; CCAT N	Hebrew letter FINAL NUN
110	<  U+05DF	; CCAT N	Hebrew letter FINAL NUN 7/23/03 jw
; see code below for processing of final forms
110	<> U+05E0	; CCAT N	Hebrew letter NUN
;115	<> U+05E1	; CCAT S	Hebrew letter SAMEKH moved 7/23/03 jw
118	<> U+05E2	; CCAT (	Hebrew letter AYIN
165	 > U+05E3	; CCAT P	Hebrew letter FINAL PE
112	<  U+05E3	; CCAT P	Hebrew letter FINAL PE 7/23/03 jw
; see code below for processing of final forms
;112	<> U+05E4	; CCAT P	Hebrew letter PE  moved 7/23/03 jw
166	 > U+05E5	; CCAT C	Hebrew letter FINAL TSADI
099	<  U+05E5	; CCAT C	Hebrew letter FINAL TSADI 7/23/03 jw
; see code below for processing of final forms
099	<> U+05E6	; CCAT C	Hebrew letter TSADI = zade
113	<> U+05E7	; CCAT Q	Hebrew letter QOF
114	<> U+05E8	; CCAT R	Hebrew letter RESH
; SHIN: 083 is the bare letter; 072 and 087 carry the shin/sin dot and map to
; the two-character sequences U+05E9 U+05C1 / U+05E9 U+05C2.
083	<> U+05E9	; CCAT #	Hebrew letter SHIN (unpointed)		
072	<> U+05E9 U+05C1;		Hebrew letter SHIN with SHIN DOT   jw
087	<> U+05E9 U+05C2;		Hebrew letter SHIN with SIN DOT    jw
116	<> U+05EA	; CCAT T	Hebrew letter TAV

; Yiddish digraphs
; Return-only ( < ): each ligature comes back as the bytes of its component
; letters (119 = VAV, 121 = YOD). Nothing produces these code points going forward.

119 119	<  U+05F0	; CCAT WW	Hebrew ligature YIDDISH DOUBLE VAV = tsvey vovn
			;			(not used in Hebrew)
119 121	<  U+05F1	; CCAT WY	Hebrew ligature YIDDISH VAV YOD
			;			(not used in Hebrew)
121 121	<  U+05F2	; CCAT YY	Hebrew ligature YIDDISH DOUBLE YOD = tsvey yudn
			;			(not used in Hebrew)

; Additional punctuation

035	<> U+05F3	; Hebrew punctuation GERESH
034	<> U+05F4	; Hebrew punctuation GERSHAYIM

; General punctuation: Bidirectional control characters
; Byte 226 is the SE "zero width letter". On the return trip both ZWNJ and ZWJ
; become 226; going forward 226 is deleted (the rule with an empty right side).

226	<  U+200C	; ZERO WIDTH NON-JOINER
226	<  U+200D	; ZERO WIDTH JOINER
; 226 > U+200D is used to disable switch to final forms, so not word breaking
; NOTE(review): no "226 > U+200D" rule is visible in this section; the active
; forward rule below deletes 226 instead, so the line above looks historical.
; 226 ZWL ZeroWidthLetter prevents finals processing on consonants KMNCP
226 	>		;Delete ZERO WIDTH LETTER, no longer needed in Unicode data
;since finals are separate encoding. Any final followed by 226 should go to normal
;nonfinal form. This would normally only occur in isolation. 8/18/03 jw

; Hebrew presentation forms
; Every live rule in this group is return-only ( < ): a precomposed or "wide"
; presentation form U+FB1D..U+FB35 comes back as the SE byte(s) of the letter
; plus its mark. The forward direction never produces these code points.
; Per the per-line notes, byte 208 stands for the dagesh/mapiq (05BC).

121 105	<  U+FB1D	; CCAT YI	Hebrew letter YOD with HIRIQ		=05D9+05B4
;	<> U+FB1E	;		Hebrew point JUDEO-SPANISH VARIKA	~0306
121 121 097 <  U+FB1F	; CCAT YYA	Hebrew ligature YIDDISH YOD YOD PATAH	=05F2+05B7
118	<  U+FB20	; CCAT (	Hebrew letter ALTERNATIVE AYIN		~<font>05E2
039	<  U+FB21	; CCAT )	Hebrew letter WIDE ALEF			~<font>05D0
100	<  U+FB22	; CCAT D	Hebrew letter WIDE DALET		~<font>05D3
104	<  U+FB23	; CCAT H	Hebrew letter WIDE HE			~<font>05D4
107	<  U+FB24	; CCAT K	Hebrew letter WIDE KAF			~<font>05DB
108	<  U+FB25	; CCAT L	Hebrew letter WIDE LAMED		~<font>05DC
109	<  U+FB26	; CCAT M	Hebrew letter WIDE FINAL MEM		~<font>05DD
114	<  U+FB27	; CCAT R	Hebrew letter WIDE RESH			~<font>05E8
116	<  U+FB28	; CCAT T	Hebrew letter WIDE TAV			~<font>05EA
;	<> U+FB29	;		Hebrew letter ALTERNATIVE PLUS SIGN	~<font>002B
072	<  U+FB2A	; CCAT $	Hebrew letter SHIN with SHIN DOT	=05E9+05C1
087	<  U+FB2B	; CCAT &	Hebrew letter SHIN with SIN DOT		=05E9+05C2 
072 208	<  U+FB2C	; CCAT $.	Hebrew letter SHIN with DAGESH AND SHIN DOT=FB49+05C1
087 208	<  U+FB2D	; CCAT &.	Hebrew letter SHIN with DAGESH AND SIN DOT=FB49+05C2
039 097	<  U+FB2E	; CCAT )A	Hebrew letter ALEF with PATAH		=05D0+05B7
039 065	<  U+FB2F	; CCAT )F	Hebrew letter ALEF with QAMATS		=05D0+05B8
039 208	<  U+FB30	; CCAT ).	Hebrew letter ALEF with MAPIQ		=05D0+05BC
098 208	<  U+FB31	; CCAT B.	Hebrew letter BET with DAGESH		=05D1+05BC
103 208	<  U+FB32	; CCAT G.	Hebrew letter GIMEL with DAGESH		=05D2+05BC
100 208	<  U+FB33	; CCAT D.	Hebrew letter DALET with DAGESH		=05D3+05BC
104 208	<  U+FB34	; CCAT H.	Hebrew letter HE with MAPIQ		=05D4+05BC
; 251 is the single SE byte for VAV with dagesh (shureq); see the shureq rules that follow.
251	<  U+FB35	; CCAT W.	Hebrew letter VAV with DAGESH		=05D5+05BC
 
;SHUREQ processing
; Byte legend for this section (taken from the rules in this pass):
;   119 = VAV (U+05D5)          251 = shureq, i.e. VAV+dagesh (U+05D5 U+05BC)
;   079 = holam (U+05B9)        244 = holam-vav digraph
;   208 = dagesh byte (per the presentation-form notes)
; These rules depend on their context clauses and on each other; do not
; reorder or merge them without re-testing the verses cited below.

; Return-only: two VAV+dagesh groups separated by a cantillation mark - the
; first comes back as 119 208, the second as 251.
119 208 [any_p_se]=a 251   < U+05D5 U+05BC [any_p_se]=a U+05D5 U+05BC ;9/26/03 try this. keep.

;jk SEQUENCE HERE! works
; [any_p_or_vow] = union of the cantillation and vowel classes, used only as a
; negated context in the next rule.
UniClass[any_p_or_vow] = ( [any_p_se] [vow_se] )
; 251 <-> VAV+dagesh only when what follows is either a cantillation mark that
; is not followed by a vowel, or something that is neither cant nor vowel.
251    <> U+05D5 U+05BC / _ ( [any_p_se] ^[vow_se] | ^[any_p_or_vow] )

; Return-only: after a vowel, two consecutive VAV+dagesh groups come back as 119 208 251.
119 208 251 < U+05D5 U+05BC U+05D5 U+05BC/ [vow_se] _

;HOLAM/WAW and WAW/HOLAM processing
;change vocalic holam waw to waw-holam order and move cants
; Forward-only: holam, up to two cants, VAV (after a consonant and optional
; dagesh/rafe) becomes cants, VAV, holam.
079 [any_p_se]?=b [any_p_se]?=c 119 / [allcons] [dr_se]? _ > [any_p_se]?=b [any_p_se]?=c U+05D5 U+05B9
;this isn't called after lamed if cant = U+05A8, because of changes above
;corrected in a later pass below

;implementation of 05BA = holam haser for consonantal WO
119 [dr_se]? [any_p_se]{0,2} 079 <> U+05D5 [dr_se]? [any_p_se]{0,2} U+05BA

;where there is consonantal waw followed by plene holam
; Note that in the first rule tags d and f change places between the two sides.
119=a [dr_se]?=b [any_p_se]?=f 079=c [any_p_se]?=d 119=e  <> U+05D5=a [dr_se]?=b [any_p_se]?=d [any_p_se]?=f U+05D5=e U+05B9=c
119=a [dr_se]?=b                     [any_p_se]?=d 244     > U+05D5=a [dr_se]?=b [any_p_se]?=d               U+05D5   U+05B9

;where holam waw is followed by a vowel, waw is consonantal and the holam belongs to the preceding consonant
079 [any_p_se]?=b [any_p_se]?=c 119 [vow_se]=d <> [any_p_se]?=b [any_p_se]?=c U+05B9 U+05D5 [vow_se]=d

;RETURN TRIP
;needs a return trip for the w/o order in old text
[any_p_se]?=a [any_p_se]?=b 244	< U+05B9 [any_p_se]?=a [any_p_se]?=b U+05D5 			   ;old vocalic     OW

;but also need to distinguish return trip from old and new text with U+05D5 U+05B9 order
; The preceding context decides: after consonant (+dots/dagesh/cants) the pair
; is the vocalic digraph 244; after a vowel the VAV is consonantal (119 ... 079).
244			< U+05D5	  U+05B9 / [cons]   [shindots]? [dr_se]? [any_p_se]{0,2} _ ;new vocalic     OW
119 [dr_se]? 079	< U+05D5 [dr_se]? U+05B9 / [vow_se] 			 [any_p_se]{0,2} _ ;old consonantal WO

;special case (Dan 5.6.3, 5.9.6) where the preceding vowel is plene hireq
119 [dr_se]? 079	< U+05D5 [dr_se]? U+05B9 / U+05D6 U+05B4 [any_p_se]{0,2} U+05D9 [any_p_se]{0,2} _ ;old consonantal WO

;special case (Jer 52.19.7) where the yodh is consonantal and the waw vocalic
244			< U+05D5          U+05B9 / U+05E7 U+05B4                 U+05D9 U+0594          _ ;new vocalic     OW


; Hebrew presentation forms, continued (U+FB36..U+FB4F).
; As in the first group, the "<" rules are return-only decompositions of a
; precomposed letter into letter byte + mark byte (208 = dagesh, 038 = rafe,
; 079 = holam). The two ">" rules (172, 169) are forward-only DE bytes for a
; final letter carrying a dagesh.
122 208	<  U+FB36	; CCAT Z.	Hebrew letter ZAYIN with DAGESH		=05D6+05BC
088 208	<  U+FB38	; CCAT +.	Hebrew letter TET with DAGESH		=05D8+05BC
121 208	<  U+FB39	; CCAT Y.	Hebrew letter YOD with DAGESH		=05D9+05BC
107 208	<  U+FB3A	; CCAT K.	Hebrew letter FINAL KAF with DAGESH	=05DA+05BC
172	 > U+05DA U+05BC;			ditto DE
107 208	<  U+FB3B	; CCAT K.	Hebrew letter KAF with DAGESH		=05DB+05BC
108 208	<  U+FB3C	; CCAT L.	Hebrew letter LAMED with DAGESH		=05DC+05BC
109 208	<  U+FB3E	; CCAT M.	Hebrew letter MEM with DAGESH		=05DE+05BC
110 208	<  U+FB40	; CCAT N.	Hebrew letter NUN with DAGESH		=05E0+05BC
115 208	<  U+FB41	; CCAT S.	Hebrew letter SAMEKH with DAGESH	=05E1+05BC
112 208	<  U+FB43	; CCAT P.	Hebrew letter FINAL PE with DAGESH	=05E3+05BC
169	 > U+05E3 U+05BC;			ditto DE
112 208	<  U+FB44	; CCAT P.	Hebrew letter PE with DAGESH		=05E4+05BC
099 208	<  U+FB46	; CCAT C.	Hebrew letter TSADI with DAGESH		=05E6+05BC
113 208	<  U+FB47	; CCAT Q.	Hebrew letter QOF with DAGESH		=05E7+05BC
114 208	<  U+FB48	; CCAT R.	Hebrew letter RESH with DAGESH		=05E8+05BC
083 208	<  U+FB49	; CCAT #.	Hebrew letter SHIN with DAGESH		=05E9+05BC
116 208	<  U+FB4A	; CCAT T.	Hebrew letter TAV with DAGESH		=05EA+05BC
119 079	<  U+FB4B	; CCAT WO	Hebrew letter VAV with holem		=05D5+05B9
098 038	<  U+FB4C	; CCAT B,	Hebrew letter BET with RAFE		=05D1+05BF
107 038	<  U+FB4D	; CCAT K,	Hebrew letter KAF with RAFE		=05DB+05BF
112 038	<  U+FB4E	; CCAT P,	Hebrew letter PE with RAFE		=05E4+05BF
039 108	<  U+FB4F	; CCAT )L	Hebrew ligature ALEF LAMED		=05D0+05DC
			;			(not used in Biblical Hebrew)

; SIL Ezra characters not represented in Unicode

;036	<> U+0306	; (CCAT 52)	Hebrew mark NUMBER (incorrectly used for 05C4)
; *** temporary replacement for the above for processing miscoded BHS text ***
; *** note that Ezra 036 is never used for NUMBER in actual Biblical Hebrew
; NOTE(review): U+0307 is also claimed on the return side by "178 < U+0307"
; earlier in this pass, so only one of 178 / 036 can actually come back for a
; bare U+0307 - confirm which is intended.
036	<> U+0307	; Hebrew mark NUMBER-single dot
037	<> U+0308	; Hebrew mark NUMBER-double dot  NEW ORDER
043	 > U+002A	; Combining ASTERISK ABOVE - change to reg. asterisk 8/15/03 jw
; note that setuma & petuha codes should appear only in running text with actual whitespace on at least one side
; SETUMA: byte 060 <-> U+05E1 (the SAMEKH letter), but only in the four
; environments listed: between whitespace, whitespace + number, or whitespace
; on one side and the start/end of the text (#) on the other.
060 / [WS] _ [WS]   <> U+05E1 / [WS] _ [WS]  ;original code left out setuma/petuha with no whitespace after (Was   00A7) 
060 / [WS] _ [num]   <> U+05E1 / [WS] _ [num]; CCAT S	Hebrew punctuation SETUMA 
060 / [WS] _ #  <> U+05E1 / [WS] _ #	; CCAT S	Hebrew punctuation SETUMA
060 / # _ [WS]  <> U+05E1 / # _ [WS]	; CCAT S	Hebrew punctuation SETUMA 

; PETUHA: byte 062 <-> U+05E4 (the PE letter) in the same four environments.
062 / [WS] _ [WS]   <> U+05E4 / [WS] _ [WS]  ;original code left out setuma/petuha with no whitespace after. (Was  00B6)jw
062 / [WS] _ [num]   <> U+05E4 / [WS] _ [num]; CCAT P	Hebrew punctuation PETUHA 
062 / [WS] _ #  <> U+05E4 / [WS] _ #	; CCAT P	Hebrew punctuation PETUHA
062 / # _ [WS]  <> U+05E4 / # _ [WS]	; CCAT P	Hebrew punctuation PETUHA 

; The plain letter rules come after the context-restricted setuma/petuha rules
; above; outside those environments U+05E1 / U+05E4 are the ordinary letters.
115	<> U+05E1	; CCAT S	Hebrew letter SAMEKH

112	<> U+05E4	; CCAT P	Hebrew letter PE
064	<> U+25CC	;	 	PLACEHOLDER
; INVERTED NUN: the older three-character spelling is accepted on return only;
; the current round-trip form is U+05C6 U+0307.
078	<  U+05E0 U+034F U+0307 ; CCAT N]8	Hebrew letter INVERTED NUN jw changed to PUA F300, changed to seq 8/15/03
;					changed back to 0307 9/9/03 jw
;					changed to (Unicode 4.1)
078	<> U+05C6 U+0307 ; U+05C6 added in Unicode 4.1 cjs April 2006 U+034F deleted cjs March 2007
; Hataf + meteg ligature bytes: round-trip with ZWJ between hataf and meteg;
; the ZWJ-less sequence is accepted on the return trip only.
102	<> U+05B1 ZWJ U+05BD; CCAT :E35	Hebrew point HATAF SEGOL with METEG - cjs added ZWJ
102	<  U+05B1     U+05BD; CCAT :E35	Hebrew point HATAF SEGOL with METEG
; Forward-only variants: several SE/DE bytes collapse onto one Unicode value
; (or a letter+vowel sequence) and therefore have no return rule here.
111	 > U+05B8	; CCAT F	Hebrew point QAMATS HATUF (kamats-o)
196	 > U+05B8	;			ditto DE
211	 > U+05B8	;			ditto DE
224	 > U+05B8	;			ditto DE
167	 > U+05DA U+05B8; CCAT KF	Hebrew letter FINAL KAF with QAMATS
168	 > U+05DF U+05B8; CCAT NF	Hebrew letter FINAL NUN with QAMATS
170	 > U+05DA U+05B0; CCAT K:	Hebrew letter FINAL KAF with SHEVA
;173	 > U+FB3A U+05B8; CCAT K.F	Hebrew letter FINAL KAF with DAGESH and QAMATS
173	 > U+05DA U+05BC U+05B8; CCAT K.F	Hebrew letter FINAL KAF with DAGESH and QAMATS jw changed from FB3A 05B8
; LOWER DOT: U+05C5 round-trips; the two earlier spellings (U+0323 and PUA
; U+F301) are accepted on the return trip only.
179	<> U+05C5	; CCAT 53	Hebrew mark LOWER DOT  ;cjs Unicode 4.1 added U+05C5
179	<  U+0323	; CCAT 53	Hebrew mark LOWER DOT  ;jw 7/24/03 don't use F301 any longer
179	<  U+F301	; CCAT 53	Hebrew mark LOWER DOT  ;jw changed to PUA
180	<> U+05AB	; CCAT ^	Hebrew ACCENT
181	 > U+05AB	;			ditto DE
; Forward-only vowel+letter ligature bytes (DE): each expands to vowel + mater.
202	 > U+05B5 U+05D9; CCAT "Y	Hebrew letter TSERE with YOD
203	 > U+05B5 U+05D4; CCAT "H	Hebrew letter TSERE with HE
228	 > U+05B8 U+05D4; CCAT FH	Hebrew letter QAMATS with HE
234	 > U+05B6 U+05D9; CCAT EY	Hebrew letter SEGOL with YOD
235	 > U+05B6 U+05D4; CCAT EH	Hebrew letter SEGOL with HE
238	 > U+05B4 U+05D9; CCAT IY	Hebrew letter HIRIQ with YOD
244	<> U+05B9 U+05D5; CCAT OW	Hebrew letter holem VAV (holem TO RIGHT); return conversion with
;intervening cants is handled in ASCII pass. 9/12/03 jw
246	 > U+05B9 U+05D4; CCAT OH	Hebrew letter holem with HE
248	 > U+05B0	; CCAT :	Hebrew point SILENT SHEVA
205	 > U+05B0	;			ditto DE
218	 > U+05B0	;			ditto DE
237	 > U+05B0	;			ditto DE
254	<> U+05B2 ZWJ U+05BD; CCAT :A35	Hebrew point HATAF PATAH with METEG - cjs added ZWJ
254	<  U+05B2     U+05BD; CCAT :A35	Hebrew point HATAF PATAH with METEG
; Forward-only with an empty right side: byte 255 is dropped from the output.
255	 > 		;		THIN SPACE - omitted as used only for DE spacing

; miscellaneous characters in SIL Hebrew, not converted
; checked that all characters 0-255 have been included PK 1/12/2000
;
; Plain punctuation, digits and quotation marks. Everything here is a
; one-to-one round trip ( <> ). Several paired characters were deliberately
; switched to the other member of the pair by jw (see the "was"/"from" notes),
; so the byte value and the Unicode value do not always share a code number.

032	<> space
033	<> exclamation_mark
040	<> U+0028 ;jw was right_parenthesis
041	<> U+0029 ;jw was left_parenthesis
042	<> asterisk
044	<> comma
046	<> full_stop
047	<> solidus
048	<> digit_zero
049	<> digit_one
050	<> digit_two
051	<> digit_three
052	<> digit_four
053	<> digit_five
054	<> digit_six
055	<> digit_seven
056	<> digit_eight
057	<> digit_nine
059	<> semicolon
063	<> question_mark
091	<> U+005B ; jw from right_square_bracket Note: word-initial paired items
092	<> reverse_solidus  ;(backslash)        ;will not display in Ezra SIL in Word 2002
093	<> U+005D ; jw from left_square_bracket
123	<> U+007B ; jw from right_curly_bracket
125	<> U+007D;  jw from left_curly_bracket
145	<> U+2019	; RIGHT SINGLE QUOTATION MARK
146	<> U+2018	; LEFT SINGLE QUOTATION MARK
147	<> U+201E	; DOUBLE LOW-9 QUOTATION MARK
148	<> U+201C	; jw from 0201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
150	<> U+2013	; EN DASH
151	<> U+2014	; EM DASH
; NOTE(review): 160 and 176 are both declared "<> U+00A0", so two rules claim
; the same return mapping and only one byte can come back for U+00A0. Confirm
; which byte is wanted and make the other rule forward-only ( > ).
160	<> U+00A0	; NO-BREAK SPACE       ;jw this item is Reserved in font
171	<> U+00AB	; jw was 00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
176	<> U+00A0	; NO-BREAK SPACE
187	<> U+00BB	; jw was 00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK

; Verse-specific overrides. Each rule matches one exact byte sequence, so it
; takes effect only where that whole sequence occurs.

;special case at Jdg 19.13.3 - seems to have the wrong vowel-cant order in WLC
; Forward-only: the whole five-byte sequence is replaced by a fixed Unicode string.
108 252 107 156 65 > U+05DC U+05B0 U+05DA U+05B8 U+05A5

;special case at 2Sa 3.8.11 added cjs Oct 2007
;BHS shows the cant to the left of the holem, so use U+05AE and insert ZWNJ - extend the context if necessary
110 79 141 107 > U+05E0  U+05B9 U+05AE ZWNJ U+05DB

;special case at 2Sa 3.8.11 added cjs Oct 2007
; NOTE(review): this heading repeats the verse reference of the rule above,
; although the byte sequence is a different word - confirm the citation.
;BHS shows the cant to the right, so insert ZWNJ
;also move the word-initial telisha magnum (MC 14) at this stage to avoid a match in the [pp] rearrangement
;note that shureq is already decomposed 251 > 119 208
; Round-trip rule: the leading 132 is moved after VAV+dagesh and followed by
; ZWNJ; the trailing 132 stays at the end.
132 119 208 109 105 132 <> U+05D5 U+05BC U+05A0 ZWNJ U+05DE U+05B4 U+05A0


;This pass does the final rearrangement to NEW ORDER. Also lots of documentation here.
; It is a Unicode-to-Unicode pass: both sides of every rule that follows are
; Unicode code points, with ">" rules used going forward and "<" rules on the
; return trip.

pass (Unicode)

; **********************************************************************************
; TIPS
; **********************************************************************************
; / follow this with pre-context _ post-context
; | match either preceding or following context
; ? match preceding item 0 or 1 times   (optional single)
; * match preceding item 0 to 15 times  (optional multiple, deleting all but first)
; + match preceding item 1 to 15 times  (one occurrence required)
; {0,2} means 0-2 times

; pp=pre-positive cant
; p = point cant (no vowel)
; post=post-positive cant
; dr=dagesh/rafe
; vow = vowel

; ---- Character classes used by the rearrangement rules of this pass ----

; [pp]: pre-positive cantillation marks
Class [pp]       = ( U+059D U+05A0 U+059A U+05AD )

; [high_p]: high cantillation marks and dots
;added last 3 9/9/03 jw, added the [hipost] cjs May 2006
Class [high_p]   = ( U+0593 U+0594 U+0595 U+0597 U+0598 U+0599 \
                     U+059C U+059E U+059F U+05A1 U+05A8 U+05A9 \
                     U+05AB U+05AC U+05AE U+05C4 U+0307 U+0308 )

; [low_p]: low cantillation marks, including meteg (U+05BD)
;cjs added U+05C5 and U+05A2
Class [low_p]    = ( U+0591 U+0596 U+059B U+05A3 U+05A4 U+05A5 \
                     U+05A6 U+05A7 U+05AA U+05BD U+05C5 U+05A2 )

; [hipost]: high post-positive marks
;cjs added U+0592
Class [hipost]   = ( U+0592 U+0599 U+05A9 U+05AE )

; [cons]: the non-final consonants (final forms U+05DA/DD/DF/E3/E5 are not members)
Class [cons]     = ( U+05D0 U+05D1 U+05D2 U+05D3 U+05D4 U+05D5 \
                     U+05D6 U+05D7 U+05D8 U+05D9 U+05DB U+05DC \
                     U+05DE U+05E0 U+05E1 U+05E2 U+05E4 U+05E6 \
                     U+05E7 U+05E8 U+05E9 U+05EA )

; [dr]: dagesh and rafe
Class [dr]       = ( U+05BC U+05BF )

; [vow]: every vowel point U+05B0..U+05BB plus U+05C7
Class [vow]      = ( U+05B0 .. U+05BB U+05C7 )

; [lo_vow]: the same without U+05B9 and U+05BA (the holam points)
Class [lo_vow]   = ( U+05B0 .. U+05B8 U+05BB U+05C7 )

; [shindots]: shin dot and sin dot
Class [shindots] = ( U+05C1 U+05C2 )


;The purpose of this rearrangement is to meet Unicode requirements of word-initial marks following the base 
;consonant and vowel. The second purpose is to place the data into the NEW ORDER, decided by the font designer 
;group of Microsoft, Antioch, Tiro Typeworks, SBL, and SIL in May 2003. 

;The SIL Ezra (old) order is:
;[pre-positive] [consonant] [shin/sin dots][dagesh][rafe] [right meteg] [vowel] [cants] ([2nd vowel,cants])[low post-positive]

;The NEW ORDER is:
;[consonant] [shin/sin dot] [dagesh][rafe] [holem] [right meteg] [low vowels] [low cantillations] (low 2nd vowel) 
;[low pre-positive] [high pre-positive] [hi cant or dots] [low post-positive] (ZWNJ+any mark)

;Things to note about this order:
;1) No cantillation mark ever comes before a vowel, except the very exceptional right meteg. We hope to have a Unicode character assigned for it in the future. It will probably then be moved to the low cantillations group.
;2) All low marks are contiguous. All high marks (exc. holem) are contiguous, (since there are no low post-positives.)
;3) Normal order then is consonant-vowel-low cants-high cants. This is generally the same order as MC and SIL Ezra, 
;although it was not unusual to have high cants then low cants or other mixed groupings.
;4) The NEW ORDER is not canonical order or NFC/NFD order. Data which is in these orders will likely not display
;properly with any of the group's fonts.
;5) Any mark which must occur out of order in order to get proper rendering, such as left meteg on hataf,
;etc. should occur with ZWNJ, ZWJ, or CGJ and be last, where possible. See below:

;<hataf, meteg> = variable rendering depending on font (medial meteg in 
;SBL Hebrew, Vusillus, etc; left meteg in some other fonts)
;<hataf, ZWJ, meteg> = always medial ligated form
;<hataf, ZWNJ, meteg> = always left meteg (post hataf)
;<meteg, CGJ, hataf> = always right meteg (pre hataf)

;This does mean, of course, that if you can't reliably determine what font 
;will be used to display the text, e.g. in web publishing, you are going to 
;want to use ZWJ and ZWNJ in every case and not rely on the default 
;rendering of particular fonts for <hataf, meteg>.

;Another way to look at it:
;ZWJ - for circumstances in which actual ligation occurs at the glyph level.
;ZWNJ - for prevention of ligation.
;CGJ - for controlling mark ordering.
;These characters are available on the Ezra keyboards, but may or may not be accessible in Word. 
;They can be added with Unicode macros, if necessary.


;**************** START OF REARRANGEMENT **************************************** 
; OUT OF BOUNDS - whitespace or word-breaking, incl. numbers, punctuation, latin punctuation, dashes, spaces, no thinspace, quotes. It is likely that I have left something out of this list. 8/21/03 jw

; [OOB] ("out of bounds"): characters treated as a word boundary. The class is
; used only as the preceding context ( / [OOB] _ ) of the pre-positive rules in
; this pass, so only membership matters, not member order.
; Duplicate members removed: U+0020 was listed on its own as well as inside
; U+0020 .. U+0040, and U+05BE U+05C0 U+05C3 appeared twice.
UniClass	[OOB] 	= ( U+0009 U+000A U+000D U+00A0 U+0020 .. U+0040 U+05F3 U+05F4 \
				U+05BE U+05C0 U+05C3 U+005B .. U+0060 \
				U+007B .. U+007E U+00A7 U+00AB U+00AF \
				U+00B6 U+00BB U+00BF U+2000 .. U+200A U+2010 .. U+2021 \
				U+2039 U+203A );deleted 25CC. JK says ignore 25CCs inserted by Uniscribe. 8/25/03 
;				added Hebrew punctuation 8/25/03
;A dotted circle followed by a pre-positive accent is certainly legal in itself. So I don't want the 
;rearrangement to run on a typed dotted circle. It is removed from list above.

;NOTE pp_se MUST be used with WhiteSpace/OOB environments. Otherwise, you will have marks identified
;as word-initial which are not.
; Added shindots to all logic 10/6/03 jw 
; added ZWJ (U+200D) cjs April 2006
; added CGJ (U+034F) and ZWNJ (U+200C) cjs Feb 2007

; Word-initial pre-positive rearrangement (forward only). The same rule is
; given twice: once after an [OOB] character and once at the start of the text (#).
; Tags:  a = pre-positive accent      b = consonant        j = shin/sin dot
;        c = dagesh/rafe (0-2)        l = meteg U+05BD     m = CGJ U+034F
;        d = vowels (0-2)             k = ZWNJ or ZWJ      h = lower dot U+05C5
;        i, f = high marks            e, g = low marks
; Output order: consonant, dot, dagesh/rafe, meteg, CGJ, vowels, ZWNJ/ZWJ,
; lower dot, low marks, THEN the pre-positive accent, then the high marks.
[pp]=a [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g / [OOB] _  >  @b @j @c @l @m @d @k @h @e @g @a @i @f  
[pp]=a [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g /  # _     >  @b @j @c @l @m @d @k @h @e @g @a @i @f 

;normal rearrangement - no pre-positive, no second vowel
; Same tags as above (without a, l, m, k). Output puts the lower dot and all
; low marks (h, e, g) before all high marks (i, f).
[cons]=b [shindots]=j? [dr]{0,2}=c [vow]{0,2}=d [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g  >  @b @j @c @d @h @e @g @i @f

;rare 2nd vowel rearrangement, doesn't add CGJ-034F before 2nd vowel because that interferes with positioning in Word
;although is helpful for preventing canonical reordering.
;this is now recommended in Unicode 4.1, so it was added here cjs April 2006
; Current behaviour: the rule below DOES insert U+034F (CGJ) in front of the
; second vowel; the first comment line above describes the earlier version.
; In this rule k = shin/sin dot and j = the second vowel.
[cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g [vow]=j  >  @b @k @c [vow]{1,2}=d @h @e @g U+034F @j @i @f

;NOTEPAD is more reliable. WORD 2002 is not reliably displaying sequences of Latin marks RTL. We are avoiding this problem
;by not encoding 2 Latin marks in a row.

;**************** RETURN TRIP HERE **************************************** 
; All rules in this group are return-only ( < ): the right-hand side is the
; NEW ORDER Unicode sequence that is matched, the left-hand side is the order
; written back for the old (SIL Ezra) arrangement.
;2nd vowel
; @b @k @c @d @e @h @f @j		< [cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [low_p]{0,2}=e U+05C5?=h U+034F [vow]=j [high_p]{0,2}=f  
; changed to fix doubling of hireq in Jerusalem on return trip cjs May 2006
; return trip is also needed for back conversion of older text lacking CGJ - note the ? with U+034F
; The CGJ is untagged, so it is dropped; the second vowel (j) is moved to the end.
 @b @k @c [vow]{1,2}=d @e @h @f @j	< [cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [low_p]{0,2}=e U+05C5?=h U+034F? [vow]=j [high_p]{0,2}=f  

;regular consonant stays same
 @b @j @c @d @e @h @f			< [cons]=b [shindots]=j? [dr]{0,2}=c [vow]{0,2}=d [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f 

; prepositives
;cjs added ZWJ = 200D May 2006 and CGJ (U+034F) and ZWNJ (U+200C) cjs Feb 2007
; At the start of the text (#) or after an [OOB] character, a pre-positive
; accent (a) found after the low marks is moved back in front of its consonant.
@a @b @j @c @l @m @d @k @e @h @f		< [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [low_p]{0,2}=e U+05C5?=h [pp]=a [high_p]{0,2}=f  /  # _  
@a @b @j @c @l @m @d @k @e @h @f		< [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [low_p]{0,2}=e U+05C5?=h [pp]=a [high_p]{0,2}=f  / [OOB] _ 

;postpositives are untouched because they are always last in both encodings.


; ***********
; extra pass added to insert CGJ, ZWJ, ZWNJ into vowel-meteg or similar sequences
; also corrects miscellaneous problems
; Like the previous pass this is Unicode-to-Unicode. It defines its own
; [WS], [lo_vow], [hataf], [lo_cant] and [hi_cant] classes right after the
; pass statement.


pass (Unicode)

; Classes used by the rules of this pass.
; [WS]      whitespace: tab, LF, CR, space, no-break space
; [lo_vow]  vowel points other than the hatafs and the holam points
; [hataf]   the three hataf vowels
; [lo_cant] / [hi_cant]  cantillation marks split into a low and a high group
Class [WS]      = ( U+0009 U+000A U+000D U+0020 U+00A0 )
Class [lo_vow]  = ( U+05B0 U+05B4 .. U+05B8 U+05BB U+05C7 )
Class [hataf]   = ( U+05B1 U+05B2 U+05B3 )
Class [lo_cant] = ( U+0591 U+0596 U+059A U+059B U+05A2 U+05A3 U+05A4 \
                    U+05A5 U+05A6 U+05A7 U+05AA U+05AD U+05C5 )
Class [hi_cant] = ( U+0592 U+0593 U+0594 U+0595 U+0597 U+0598 U+0599 \
                    U+059C U+059D U+059E U+059F U+05A0 U+05A1 U+05A8 \
                    U+05A9 U+05AB U+05AC U+05AE U+05AF U+05C4 )

;this is the meteg stuff
;[lo_vow] U+05BD - canonical order so unchanged U+05BD = meteg
; Meteg placed BEFORE a low vowel gets a CGJ (U+034F) inserted after it going
; forward; the CGJ is removed again on the return trip. A high cant standing
; between meteg and vowel is moved after the vowel; a low cant stays in place.
; NOTE(review): when the optional cant is absent both rules reduce to the same
; pattern (U+05BD [lo_vow]); this looks harmless because they give the same result.
U+05BD [hi_cant]?=b	[lo_vow]=a 	<> U+05BD		U+034F [lo_vow]=a [hi_cant]?=b	;CGJ
U+05BD [lo_cant]?=b 	[lo_vow]=a 	<> U+05BD [lo_cant]?=b	U+034F [lo_vow]=a		;CGJ

;medial meteg with hataf needs ZWJ - dealt with above - see 102 and 254
;however the ZWJ has to be removed for the back conversion to work properly with pre-pos cants
;even though this loses the left/medial distinction in the return trip
; Return-only: strip the ZWJ between hataf and meteg.
[hataf] U+05BD	  <  [hataf] U+200D U+05BD	;ZWJ	medial

; Round trip: hataf followed by meteg gets ZWNJ in between (left meteg);
; meteg followed by hataf gets CGJ in between (right meteg).
[hataf] U+05BD    <> [hataf] U+200C U+05BD	;ZWNJ	left
U+05BD  [hataf]=a <> U+05BD  U+034F [hataf]=a	;CGJ	right

;when metheg (silluq) is to the left of another low cant, it needs a CGJ to hold it on normalisation
U+05BD / [lo_cant] _ <> U+034F U+05BD

; miscellaneous problems
; 1. holem-waw with lamed and azla/qadma cant
; the next line is not needed on the return trip and would give a mismatch at Dan 6.5.4
; Forward-only: after LAMED (with optional dagesh), holam + U+05A8 + VAV is
; reordered to U+05A8 + VAV + holam.
U+05B9 U+05A8 U+05D5 / U+05DC U+05BC? _ > U+05A8 U+05D5 U+05B9

; 2. holam as a second vowel is rare (only 2Ki 21.26.1) but must occur first and needs the CGJ to prevent reordering
; Forward: low vowel + CGJ + holam -> holam + CGJ + low vowel.
; Return:  holam + CGJ + low vowel -> low vowel + holam (CGJ dropped).
[lo_vow]=a U+034F U+05B9=e  > U+05B9=e U+034F [lo_vow]=a
[lo_vow]=a        U+05B9=e <  U+05B9=e U+034F [lo_vow]=a

; 3. special correction for the unique Job 6.10.1 with no reversal
; Forward-only, at a word start (after whitespace or at the start of the text):
; U+05A5 is moved from before VAV+dagesh to after it.
U+05A5 U+05D5 U+05BC / ( [WS] | # ) _ > U+05D5 U+05BC U+05A5