1/* This is the Assembler Pre-Processor
2   Copyright (C) 1987-2017 Free Software Foundation, Inc.
3
4   This file is part of GAS, the GNU Assembler.
5
6   GAS is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3, or (at your option)
9   any later version.
10
11   GAS is distributed in the hope that it will be useful, but WITHOUT
12   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
14   License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with GAS; see the file COPYING.  If not, write to the Free
18   Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19   02110-1301, USA.  */
20
21/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
22/* App, the assembler pre-processor.  This pre-processor strips out
23   excess spaces, turns single-quoted characters into a decimal
24   constant, and turns the # in # <number> <filename> <garbage> into a
25   .linefile.  This needs better error-handling.  */
26
27#include "as.h"
28
/* When the compiler does not report itself as standard C (__STDC__
   is not 1) and nothing else has defined `const', make the keyword
   expand to nothing.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.
   It is not static, so it can be set from outside this file.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  Set by do_scrub_begin.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The trailing '0' also stands for '1':
   the recognizer accepts either digit in that position.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Targets other than m68k never scrub in MRI mode, so every test of
   scrub_m68k_mri is constant false.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  symver_state points at the
   next character of symver_pseudo still to be matched, or is NULL
   when no match is in progress.  */
static const char   symver_pseudo[] = ".symver";
static const char * symver_state;
#endif
58
/* Lexical class of every possible character value, indexed by the
   character taken as an unsigned value.  The table is filled in by
   do_scrub_begin; an entry of 0 means that no class has been assigned
   to that character.  */
static char lex[256];

/* Characters that do_scrub_begin marks as LEX_IS_SYMBOL_COMPONENT
   before the comment and separator classes are applied on top.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* The lexical classes stored in lex[].  The values only need to be
   distinct and nonzero; 7 is not assigned in this list.  Some classes
   exist only for particular targets or configurations.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
#define	LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14
#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Predicates testing the class of character C.  C is used directly
   as an index into lex[], so it must be a valid index (0..255); in
   particular EOF must be ruled out before using these.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
93
94static int process_escape (int);
95
96/* FIXME-soon: The entire lexer/parser thingy should be
97   built statically at compile time rather than dynamically
98   each and every time the assembler is run.  xoxorich.  */
99
100void
101do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
102{
103  const char *p;
104  int c;
105
106  lex[' '] = LEX_IS_WHITESPACE;
107  lex['\t'] = LEX_IS_WHITESPACE;
108  lex['\r'] = LEX_IS_WHITESPACE;
109  lex['\n'] = LEX_IS_NEWLINE;
110  lex[':'] = LEX_IS_COLON;
111
112#ifdef TC_M68K
113  scrub_m68k_mri = m68k_mri;
114
115  if (! m68k_mri)
116#endif
117    {
118      lex['"'] = LEX_IS_STRINGQUOTE;
119
120#if ! defined (TC_HPPA) && ! defined (TC_I370)
121      /* I370 uses single-quotes to delimit integer, float constants.  */
122      lex['\''] = LEX_IS_ONECHAR_QUOTE;
123#endif
124
125#ifdef SINGLE_QUOTE_STRINGS
126      lex['\''] = LEX_IS_STRINGQUOTE;
127#endif
128    }
129
130  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
131     in state 5 of do_scrub_chars must be changed.  */
132
133  /* Note that these override the previous defaults, e.g. if ';' is a
134     comment char, then it isn't a line separator.  */
135  for (p = symbol_chars; *p; ++p)
136    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
137
138  for (c = 128; c < 256; ++c)
139    lex[c] = LEX_IS_SYMBOL_COMPONENT;
140
141#ifdef tc_symbol_chars
142  /* This macro permits the processor to specify all characters which
143     may appears in an operand.  This will prevent the scrubber from
144     discarding meaningful whitespace in certain cases.  The i386
145     backend uses this to support prefixes, which can confuse the
146     scrubber as to whether it is parsing operands or opcodes.  */
147  for (p = tc_symbol_chars; *p; ++p)
148    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
149#endif
150
151  /* The m68k backend wants to be able to change comment_chars.  */
152#ifndef tc_comment_chars
153#define tc_comment_chars comment_chars
154#endif
155  for (p = tc_comment_chars; *p; p++)
156    lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
157
158  for (p = line_comment_chars; *p; p++)
159    lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
160
161#ifndef tc_line_separator_chars
162#define tc_line_separator_chars line_separator_chars
163#endif
164  for (p = tc_line_separator_chars; *p; p++)
165    lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
166
167#ifdef tc_parallel_separator_chars
168  /* This macro permits the processor to specify all characters which
169     separate parallel insns on the same line.  */
170  for (p = tc_parallel_separator_chars; *p; p++)
171    lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
172#endif
173
174  /* Only allow slash-star comments if slash is not in use.
175     FIXME: This isn't right.  We should always permit them.  */
176  if (lex['/'] == 0)
177    lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
178
179#ifdef TC_M68K
180  if (m68k_mri)
181    {
182      lex['\''] = LEX_IS_STRINGQUOTE;
183      lex[';'] = LEX_IS_COMMENT_START;
184      lex['*'] = LEX_IS_LINE_COMMENT_START;
185      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
186	 then it can't be used in an expression.  */
187      lex['!'] = LEX_IS_LINE_COMMENT_START;
188    }
189#endif
190
191#ifdef TC_V850
192  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
193#endif
194#ifdef DOUBLEBAR_PARALLEL
195  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
196#endif
197#ifdef TC_D30V
198  /* Must do this is we want VLIW instruction with "->" or "<-".  */
199  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
200#endif
201
202#ifdef H_TICK_HEX
203  if (enable_h_tick_hex)
204    {
205      lex['h'] = LEX_IS_H;
206      lex['H'] = LEX_IS_H;
207    }
208#endif
209}
210
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars so that it can return at any point and pick up
   where it left off.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to resume once a temporary state (string, comment, or
   out_string output) has finished.  */
static int old_state;
/* Next character to emit while in state -1.  */
static const char *out_string;
/* Holds the decimal text produced for a one-character quote
   constant.  */
static char out_buf[20];
/* Count of newlines consumed inside comments or continued strings
   that still have to be emitted.  */
static int add_newlines;
/* When not NULL, do_scrub_chars resumes reading from here, for
   saved_input_len bytes, instead of calling its GET routine.  */
static char *saved_input;
static size_t saved_input_len;
/* Buffer handed to the GET routine to receive raw input.  */
static char input_buffer[32 * 1024];
/* Progress of the ".mri" pseudo-op recognizer: the next character of
   mri_pseudo to match, or NULL when no match is in progress.  */
static const char *mri_state;
/* Last character accepted by the ".mri" recognizer.  */
static char mri_last_ch;
222
/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member holds a copy of the file-scope variable of the same
   name; app_push fills the structure in and app_pop restores from
   it.  */

struct app_save
{
  int          state;
  int          old_state;
  const char * out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Heap copy of the pending input made by app_push, or NULL if
     there was none.  saved_input_len is only set when saved_input is
     not NULL.  */
  char *       saved_input;
  size_t       saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
};
246
247char *
248app_push (void)
249{
250  struct app_save *saved;
251
252  saved = XNEW (struct app_save);
253  saved->state = state;
254  saved->old_state = old_state;
255  saved->out_string = out_string;
256  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
257  saved->add_newlines = add_newlines;
258  if (saved_input == NULL)
259    saved->saved_input = NULL;
260  else
261    {
262      saved->saved_input = XNEWVEC (char, saved_input_len);
263      memcpy (saved->saved_input, saved_input, saved_input_len);
264      saved->saved_input_len = saved_input_len;
265    }
266#ifdef TC_M68K
267  saved->scrub_m68k_mri = scrub_m68k_mri;
268#endif
269  saved->mri_state = mri_state;
270  saved->mri_last_ch = mri_last_ch;
271#if defined TC_ARM && defined OBJ_ELF
272  saved->symver_state = symver_state;
273#endif
274
275  /* do_scrub_begin() is not useful, just wastes time.  */
276
277  state = 0;
278  saved_input = NULL;
279  add_newlines = 0;
280
281  return (char *) saved;
282}
283
284void
285app_pop (char *arg)
286{
287  struct app_save *saved = (struct app_save *) arg;
288
289  /* There is no do_scrub_end ().  */
290  state = saved->state;
291  old_state = saved->old_state;
292  out_string = saved->out_string;
293  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
294  add_newlines = saved->add_newlines;
295  if (saved->saved_input == NULL)
296    saved_input = NULL;
297  else
298    {
299      gas_assert (saved->saved_input_len <= sizeof (input_buffer));
300      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
301      saved_input = input_buffer;
302      saved_input_len = saved->saved_input_len;
303      free (saved->saved_input);
304    }
305#ifdef TC_M68K
306  scrub_m68k_mri = saved->scrub_m68k_mri;
307#endif
308  mri_state = saved->mri_state;
309  mri_last_ch = saved->mri_last_ch;
310#if defined TC_ARM && defined OBJ_ELF
311  symver_state = saved->symver_state;
312#endif
313
314  free (arg);
315}
316
/* Translate CH, the character following a backslash in a
   one-character quote constant, into the character it denotes.
   Characters with no special meaning are returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is not
   necessarily true.  */

static int
process_escape (int ch)
{
  /* Escape letter paired with the character it stands for.  */
  static const struct
  {
    char letter;
    char value;
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' },
    { '\'', '\'' },
    { '"', '\"' }
  };
  unsigned int i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (escapes[i].letter == ch)
      return escapes[i].value;

  return ch;
}
343
344/* This function is called to process input characters.  The GET
345   parameter is used to retrieve more input characters.  GET should
346   set its parameter to point to a buffer, and return the length of
347   the buffer; it should return 0 at end of file.  The scrubbed output
348   characters are put into the buffer starting at TOSTART; the TOSTART
349   buffer is TOLEN bytes in length.  The function returns the number
350   of scrubbed characters put into TOSTART.  This will be TOLEN unless
351   end of file was seen.  This function is arranged as a state
352   machine, and saves its state so that it may return at any point.
353   This is the way the old code used to work.  */
354
355size_t
356do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
357{
358  char *to = tostart;
359  char *toend = tostart + tolen;
360  char *from;
361  char *fromend;
362  size_t fromlen;
363  int ch, ch2 = 0;
364  /* Character that started the string we're working on.  */
365  static char quotechar;
366
367  /*State 0: beginning of normal line
368	  1: After first whitespace on line (flush more white)
369	  2: After first non-white (opcode) on line (keep 1white)
370	  3: after second white on line (into operands) (flush white)
371	  4: after putting out a .linefile, put out digits
372	  5: parsing a string, then go to old-state
373	  6: putting out \ escape in a "d string.
374	  7: no longer used
375	  8: no longer used
376	  9: After seeing symbol char in state 3 (keep 1white after symchar)
377	 10: After seeing whitespace in state 9 (keep white before symchar)
378	 11: After seeing a symbol character in state 0 (eg a label definition)
379	 -1: output string in out_string and go to the state in old_state
380	 -2: flush text until a '*' '/' is seen, then go to state old_state
381#ifdef TC_V850
382	 12: After seeing a dash, looking for a second dash as a start
383	     of comment.
384#endif
385#ifdef DOUBLEBAR_PARALLEL
386	 13: After seeing a vertical bar, looking for a second
387	     vertical bar as a parallel expression separator.
388#endif
389#ifdef TC_PREDICATE_START_CHAR
390	 14: After seeing a predicate start character at state 0, looking
391	     for a predicate end character as predicate.
392	 15: After seeing a predicate start character at state 1, looking
393	     for a predicate end character as predicate.
394#endif
395#ifdef TC_Z80
396	 16: After seeing an 'a' or an 'A' at the start of a symbol
397	 17: After seeing an 'f' or an 'F' in state 16
398#endif
399	  */
400
401  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
402     constructs like ``.loc 1 20''.  This was turning into ``.loc
403     120''.  States 9 and 10 ensure that a space is never dropped in
404     between characters which could appear in an identifier.  Ian
405     Taylor, ian@cygnus.com.
406
407     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
408     correctly on the PA (and any other target where colons are optional).
409     Jeff Law, law@cs.utah.edu.
410
411     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
412     get squashed into "cmp r1,r2||trap#1", with the all important space
413     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
414
415  /* This macro gets the next input character.  */
416
417#define GET()							\
418  (from < fromend						\
419   ? * (unsigned char *) (from++)				\
420   : (saved_input = NULL,					\
421      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
422      from = input_buffer,					\
423      fromend = from + fromlen,					\
424      (fromlen == 0						\
425       ? EOF							\
426       : * (unsigned char *) (from++))))
427
428  /* This macro pushes a character back on the input stream.  */
429
430#define UNGET(uch) (*--from = (uch))
431
432  /* This macro puts a character into the output buffer.  If this
433     character fills the output buffer, this macro jumps to the label
434     TOFULL.  We use this rather ugly approach because we need to
435     handle two different termination conditions: EOF on the input
436     stream, and a full output buffer.  It would be simpler if we
437     always read in the entire input stream before processing it, but
438     I don't want to make such a significant change to the assembler's
439     memory usage.  */
440
441#define PUT(pch)				\
442  do						\
443    {						\
444      *to++ = (pch);				\
445      if (to >= toend)				\
446	goto tofull;				\
447    }						\
448  while (0)
449
450  if (saved_input != NULL)
451    {
452      from = saved_input;
453      fromend = from + saved_input_len;
454    }
455  else
456    {
457      fromlen = (*get) (input_buffer, sizeof input_buffer);
458      if (fromlen == 0)
459	return 0;
460      from = input_buffer;
461      fromend = from + fromlen;
462    }
463
464  while (1)
465    {
466      /* The cases in this switch end with continue, in order to
467	 branch back to the top of this while loop and generate the
468	 next output character in the appropriate state.  */
469      switch (state)
470	{
471	case -1:
472	  ch = *out_string++;
473	  if (*out_string == '\0')
474	    {
475	      state = old_state;
476	      old_state = 3;
477	    }
478	  PUT (ch);
479	  continue;
480
481	case -2:
482	  for (;;)
483	    {
484	      do
485		{
486		  ch = GET ();
487
488		  if (ch == EOF)
489		    {
490		      as_warn (_("end of file in comment"));
491		      goto fromeof;
492		    }
493
494		  if (ch == '\n')
495		    PUT ('\n');
496		}
497	      while (ch != '*');
498
499	      while ((ch = GET ()) == '*')
500		;
501
502	      if (ch == EOF)
503		{
504		  as_warn (_("end of file in comment"));
505		  goto fromeof;
506		}
507
508	      if (ch == '/')
509		break;
510
511	      UNGET (ch);
512	    }
513
514	  state = old_state;
515	  UNGET (' ');
516	  continue;
517
518	case 4:
519	  ch = GET ();
520	  if (ch == EOF)
521	    goto fromeof;
522	  else if (ch >= '0' && ch <= '9')
523	    PUT (ch);
524	  else
525	    {
526	      while (ch != EOF && IS_WHITESPACE (ch))
527		ch = GET ();
528	      if (ch == '"')
529		{
530		  quotechar = ch;
531		  state = 5;
532		  old_state = 3;
533		  PUT (ch);
534		}
535	      else
536		{
537		  while (ch != EOF && ch != '\n')
538		    ch = GET ();
539		  state = 0;
540		  PUT (ch);
541		}
542	    }
543	  continue;
544
545	case 5:
546	  /* We are going to copy everything up to a quote character,
547	     with special handling for a backslash.  We try to
548	     optimize the copying in the simple case without using the
549	     GET and PUT macros.  */
550	  {
551	    char *s;
552	    ptrdiff_t len;
553
554	    for (s = from; s < fromend; s++)
555	      {
556		ch = *s;
557		if (ch == '\\'
558		    || ch == quotechar
559		    || ch == '\n')
560		  break;
561	      }
562	    len = s - from;
563	    if (len > toend - to)
564	      len = toend - to;
565	    if (len > 0)
566	      {
567		memcpy (to, from, len);
568		to += len;
569		from += len;
570		if (to >= toend)
571		  goto tofull;
572	      }
573	  }
574
575	  ch = GET ();
576	  if (ch == EOF)
577	    {
578	      /* This buffer is here specifically so
579		 that the UNGET below will work.  */
580	      static char one_char_buf[1];
581
582	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
583	      state = old_state;
584	      from = fromend = one_char_buf + 1;
585	      fromlen = 1;
586	      UNGET ('\n');
587	      PUT (quotechar);
588	    }
589	  else if (ch == quotechar)
590	    {
591	      state = old_state;
592	      PUT (ch);
593	    }
594#ifndef NO_STRING_ESCAPES
595	  else if (ch == '\\')
596	    {
597	      state = 6;
598	      PUT (ch);
599	    }
600#endif
601	  else if (scrub_m68k_mri && ch == '\n')
602	    {
603	      /* Just quietly terminate the string.  This permits lines like
604		   bne	label	loop if we haven't reach end yet.  */
605	      state = old_state;
606	      UNGET (ch);
607	      PUT ('\'');
608	    }
609	  else
610	    {
611	      PUT (ch);
612	    }
613	  continue;
614
615	case 6:
616	  state = 5;
617	  ch = GET ();
618	  switch (ch)
619	    {
620	      /* Handle strings broken across lines, by turning '\n' into
621		 '\\' and 'n'.  */
622	    case '\n':
623	      UNGET ('n');
624	      add_newlines++;
625	      PUT ('\\');
626	      continue;
627
628	    case EOF:
629	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
630	      PUT (quotechar);
631	      continue;
632
633	    case '"':
634	    case '\\':
635	    case 'b':
636	    case 'f':
637	    case 'n':
638	    case 'r':
639	    case 't':
640	    case 'v':
641	    case 'x':
642	    case 'X':
643	    case '0':
644	    case '1':
645	    case '2':
646	    case '3':
647	    case '4':
648	    case '5':
649	    case '6':
650	    case '7':
651	      break;
652
653	    default:
654#ifdef ONLY_STANDARD_ESCAPES
655	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
656#endif
657	      break;
658	    }
659	  PUT (ch);
660	  continue;
661
662#ifdef DOUBLEBAR_PARALLEL
663	case 13:
664	  ch = GET ();
665	  if (ch != '|')
666	    abort ();
667
668	  /* Reset back to state 1 and pretend that we are parsing a
669	     line from just after the first white space.  */
670	  state = 1;
671	  PUT ('|');
672#ifdef TC_TIC6X
673	  /* "||^" is used for SPMASKed instructions.  */
674	  ch = GET ();
675	  if (ch == EOF)
676	    goto fromeof;
677	  else if (ch == '^')
678	    PUT ('^');
679	  else
680	    UNGET (ch);
681#endif
682	  continue;
683#endif
684#ifdef TC_Z80
685	case 16:
686	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
687	  ch = GET ();
688	  if (ch == 'f' || ch == 'F')
689	    {
690	      state = 17;
691	      PUT (ch);
692	    }
693	  else
694	    {
695	      state = 9;
696	      break;
697	    }
698	  /* Fall through.  */
699	case 17:
700	  /* We have seen "af" at the start of a symbol,
701	     a ' here is a part of that symbol.  */
702	  ch = GET ();
703	  state = 9;
704	  if (ch == '\'')
705	    /* Change to avoid warning about unclosed string.  */
706	    PUT ('`');
707	  else if (ch != EOF)
708	    UNGET (ch);
709	  break;
710#endif
711	}
712
713      /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
714
715      /* flushchar: */
716      ch = GET ();
717
718#ifdef TC_PREDICATE_START_CHAR
719      if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
720	{
721	  state += 14;
722	  PUT (ch);
723	  continue;
724	}
725      else if (state == 14 || state == 15)
726	{
727	  if (ch == TC_PREDICATE_END_CHAR)
728	    {
729	      state -= 14;
730	      PUT (ch);
731	      ch = GET ();
732	    }
733	  else
734	    {
735	      PUT (ch);
736	      continue;
737	    }
738	}
739#endif
740
741    recycle:
742
743#if defined TC_ARM && defined OBJ_ELF
744      /* We need to watch out for .symver directives.  See the comment later
745	 in this function.  */
746      if (symver_state == NULL)
747	{
748	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
749	    symver_state = symver_pseudo + 1;
750	}
751      else
752	{
753	  /* We advance to the next state if we find the right
754	     character.  */
755	  if (ch != '\0' && (*symver_state == ch))
756	    ++symver_state;
757	  else if (*symver_state != '\0')
758	    /* We did not get the expected character, or we didn't
759	       get a valid terminating character after seeing the
760	       entire pseudo-op, so we must go back to the beginning.  */
761	    symver_state = NULL;
762	  else
763	    {
764	      /* We've read the entire pseudo-op.  If this is the end
765		 of the line, go back to the beginning.  */
766	      if (IS_NEWLINE (ch))
767		symver_state = NULL;
768	    }
769	}
770#endif /* TC_ARM && OBJ_ELF */
771
772#ifdef TC_M68K
773      /* We want to have pseudo-ops which control whether we are in
774	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
775	 the scrubber, that means that we need a special purpose
776	 recognizer here.  */
777      if (mri_state == NULL)
778	{
779	  if ((state == 0 || state == 1)
780	      && ch == mri_pseudo[0])
781	    mri_state = mri_pseudo + 1;
782	}
783      else
784	{
785	  /* We advance to the next state if we find the right
786	     character, or if we need a space character and we get any
787	     whitespace character, or if we need a '0' and we get a
788	     '1' (this is so that we only need one state to handle
789	     ``.mri 0'' and ``.mri 1'').  */
790	  if (ch != '\0'
791	      && (*mri_state == ch
792		  || (*mri_state == ' '
793		      && lex[ch] == LEX_IS_WHITESPACE)
794		  || (*mri_state == '0'
795		      && ch == '1')))
796	    {
797	      mri_last_ch = ch;
798	      ++mri_state;
799	    }
800	  else if (*mri_state != '\0'
801		   || (lex[ch] != LEX_IS_WHITESPACE
802		       && lex[ch] != LEX_IS_NEWLINE))
803	    {
804	      /* We did not get the expected character, or we didn't
805		 get a valid terminating character after seeing the
806		 entire pseudo-op, so we must go back to the
807		 beginning.  */
808	      mri_state = NULL;
809	    }
810	  else
811	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
		 either '1' or '0', indicating whether to enter or
		 leave MRI mode respectively.  */
815	      do_scrub_begin (mri_last_ch == '1');
816	      mri_state = NULL;
817
818	      /* We continue handling the character as usual.  The
819		 main gas reader must also handle the .mri pseudo-op
820		 to control expression parsing and the like.  */
821	    }
822	}
823#endif
824
825      if (ch == EOF)
826	{
827	  if (state != 0)
828	    {
829	      as_warn (_("end of file not at end of a line; newline inserted"));
830	      state = 0;
831	      PUT ('\n');
832	    }
833	  goto fromeof;
834	}
835
836      switch (lex[ch])
837	{
838	case LEX_IS_WHITESPACE:
839	  do
840	    {
841	      ch = GET ();
842	    }
843	  while (ch != EOF && IS_WHITESPACE (ch));
844	  if (ch == EOF)
845	    goto fromeof;
846
847	  if (state == 0)
848	    {
849	      /* Preserve a single whitespace character at the
850		 beginning of a line.  */
851	      state = 1;
852	      UNGET (ch);
853	      PUT (' ');
854	      break;
855	    }
856
857#ifdef KEEP_WHITE_AROUND_COLON
858	  if (lex[ch] == LEX_IS_COLON)
859	    {
860	      /* Only keep this white if there's no white *after* the
861		 colon.  */
862	      ch2 = GET ();
863	      if (ch2 != EOF)
864		UNGET (ch2);
865	      if (!IS_WHITESPACE (ch2))
866		{
867		  state = 9;
868		  UNGET (ch);
869		  PUT (' ');
870		  break;
871		}
872	    }
873#endif
874	  if (IS_COMMENT (ch)
875	      || ch == '/'
876	      || IS_LINE_SEPARATOR (ch)
877	      || IS_PARALLEL_SEPARATOR (ch))
878	    {
879	      if (scrub_m68k_mri)
880		{
881		  /* In MRI mode, we keep these spaces.  */
882		  UNGET (ch);
883		  PUT (' ');
884		  break;
885		}
886	      goto recycle;
887	    }
888
889	  /* If we're in state 2 or 11, we've seen a non-white
890	     character followed by whitespace.  If the next character
891	     is ':', this is whitespace after a label name which we
892	     normally must ignore.  In MRI mode, though, spaces are
893	     not permitted between the label and the colon.  */
894	  if ((state == 2 || state == 11)
895	      && lex[ch] == LEX_IS_COLON
896	      && ! scrub_m68k_mri)
897	    {
898	      state = 1;
899	      PUT (ch);
900	      break;
901	    }
902
903	  switch (state)
904	    {
905	    case 1:
906	      /* We can arrive here if we leave a leading whitespace
907		 character at the beginning of a line.  */
908	      goto recycle;
909	    case 2:
910	      state = 3;
911	      if (to + 1 < toend)
912		{
913		  /* Optimize common case by skipping UNGET/GET.  */
914		  PUT (' ');	/* Sp after opco */
915		  goto recycle;
916		}
917	      UNGET (ch);
918	      PUT (' ');
919	      break;
920	    case 3:
921#ifndef TC_KEEP_OPERAND_SPACES
922	      /* For TI C6X, we keep these spaces as they may separate
923		 functional unit specifiers from operands.  */
924	      if (scrub_m68k_mri)
925#endif
926		{
927		  /* In MRI mode, we keep these spaces.  */
928		  UNGET (ch);
929		  PUT (' ');
930		  break;
931		}
932	      goto recycle;	/* Sp in operands */
933	    case 9:
934	    case 10:
935#ifndef TC_KEEP_OPERAND_SPACES
936	      if (scrub_m68k_mri)
937#endif
938		{
939		  /* In MRI mode, we keep these spaces.  */
940		  state = 3;
941		  UNGET (ch);
942		  PUT (' ');
943		  break;
944		}
945	      state = 10;	/* Sp after symbol char */
946	      goto recycle;
947	    case 11:
948	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
949		state = 1;
950	      else
951		{
952		  /* We know that ch is not ':', since we tested that
953		     case above.  Therefore this is not a label, so it
954		     must be the opcode, and we've just seen the
955		     whitespace after it.  */
956		  state = 3;
957		}
958	      UNGET (ch);
959	      PUT (' ');	/* Sp after label definition.  */
960	      break;
961	    default:
962	      BAD_CASE (state);
963	    }
964	  break;
965
966	case LEX_IS_TWOCHAR_COMMENT_1ST:
967	  ch2 = GET ();
968	  if (ch2 == '*')
969	    {
970	      for (;;)
971		{
972		  do
973		    {
974		      ch2 = GET ();
975		      if (ch2 != EOF && IS_NEWLINE (ch2))
976			add_newlines++;
977		    }
978		  while (ch2 != EOF && ch2 != '*');
979
980		  while (ch2 == '*')
981		    ch2 = GET ();
982
983		  if (ch2 == EOF || ch2 == '/')
984		    break;
985
986		  /* This UNGET will ensure that we count newlines
987		     correctly.  */
988		  UNGET (ch2);
989		}
990
991	      if (ch2 == EOF)
992		as_warn (_("end of file in multiline comment"));
993
994	      ch = ' ';
995	      goto recycle;
996	    }
997#ifdef DOUBLESLASH_LINE_COMMENTS
998	  else if (ch2 == '/')
999	    {
1000	      do
1001		{
1002		  ch = GET ();
1003		}
1004	      while (ch != EOF && !IS_NEWLINE (ch));
1005	      if (ch == EOF)
1006		as_warn ("end of file in comment; newline inserted");
1007	      state = 0;
1008	      PUT ('\n');
1009	      break;
1010	    }
1011#endif
1012	  else
1013	    {
1014	      if (ch2 != EOF)
1015		UNGET (ch2);
1016	      if (state == 9 || state == 10)
1017		state = 3;
1018	      PUT (ch);
1019	    }
1020	  break;
1021
1022	case LEX_IS_STRINGQUOTE:
1023	  quotechar = ch;
1024	  if (state == 10)
1025	    {
1026	      /* Preserve the whitespace in foo "bar".  */
1027	      UNGET (ch);
1028	      state = 3;
1029	      PUT (' ');
1030
1031	      /* PUT didn't jump out.  We could just break, but we
1032		 know what will happen, so optimize a bit.  */
1033	      ch = GET ();
1034	      old_state = 3;
1035	    }
1036	  else if (state == 9)
1037	    old_state = 3;
1038	  else
1039	    old_state = state;
1040	  state = 5;
1041	  PUT (ch);
1042	  break;
1043
1044#ifndef IEEE_STYLE
1045	case LEX_IS_ONECHAR_QUOTE:
1046#ifdef H_TICK_HEX
1047	  if (state == 9 && enable_h_tick_hex)
1048	    {
1049	      char c;
1050
1051	      c = GET ();
1052	      as_warn ("'%c found after symbol", c);
1053	      UNGET (c);
1054	    }
1055#endif
1056	  if (state == 10)
1057	    {
1058	      /* Preserve the whitespace in foo 'b'.  */
1059	      UNGET (ch);
1060	      state = 3;
1061	      PUT (' ');
1062	      break;
1063	    }
1064	  ch = GET ();
1065	  if (ch == EOF)
1066	    {
1067	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1068	      ch = 0;
1069	    }
1070	  if (ch == '\\')
1071	    {
1072	      ch = GET ();
1073	      if (ch == EOF)
1074		{
1075		  as_warn (_("end of file in escape character"));
1076		  ch = '\\';
1077		}
1078	      else
1079		ch = process_escape (ch);
1080	    }
1081	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1082
1083	  /* None of these 'x constants for us.  We want 'x'.  */
1084	  if ((ch = GET ()) != '\'')
1085	    {
1086#ifdef REQUIRE_CHAR_CLOSE_QUOTE
1087	      as_warn (_("missing close quote; (assumed)"));
1088#else
1089	      if (ch != EOF)
1090		UNGET (ch);
1091#endif
1092	    }
1093	  if (strlen (out_buf) == 1)
1094	    {
1095	      PUT (out_buf[0]);
1096	      break;
1097	    }
1098	  if (state == 9)
1099	    old_state = 3;
1100	  else
1101	    old_state = state;
1102	  state = -1;
1103	  out_string = out_buf;
1104	  PUT (*out_string++);
1105	  break;
1106#endif
1107
1108	case LEX_IS_COLON:
1109#ifdef KEEP_WHITE_AROUND_COLON
1110	  state = 9;
1111#else
1112	  if (state == 9 || state == 10)
1113	    state = 3;
1114	  else if (state != 3)
1115	    state = 1;
1116#endif
1117	  PUT (ch);
1118	  break;
1119
1120	case LEX_IS_NEWLINE:
1121	  /* Roll out a bunch of newlines from inside comments, etc.  */
1122	  if (add_newlines)
1123	    {
1124	      --add_newlines;
1125	      UNGET (ch);
1126	    }
1127	  /* Fall through.  */
1128
1129	case LEX_IS_LINE_SEPARATOR:
1130	  state = 0;
1131	  PUT (ch);
1132	  break;
1133
1134	case LEX_IS_PARALLEL_SEPARATOR:
1135	  state = 1;
1136	  PUT (ch);
1137	  break;
1138
1139#ifdef TC_V850
1140	case LEX_IS_DOUBLEDASH_1ST:
1141	  ch2 = GET ();
1142	  if (ch2 != '-')
1143	    {
1144	      if (ch2 != EOF)
1145		UNGET (ch2);
1146	      goto de_fault;
1147	    }
1148	  /* Read and skip to end of line.  */
1149	  do
1150	    {
1151	      ch = GET ();
1152	    }
1153	  while (ch != EOF && ch != '\n');
1154
1155	  if (ch == EOF)
1156	    as_warn (_("end of file in comment; newline inserted"));
1157
1158	  state = 0;
1159	  PUT ('\n');
1160	  break;
1161#endif
1162#ifdef DOUBLEBAR_PARALLEL
1163	case LEX_IS_DOUBLEBAR_1ST:
1164	  ch2 = GET ();
1165	  if (ch2 != EOF)
1166	    UNGET (ch2);
1167	  if (ch2 != '|')
1168	    goto de_fault;
1169
1170	  /* Handle '||' in two states as invoking PUT twice might
1171	     result in the first one jumping out of this loop.  We'd
1172	     then lose track of the state and one '|' char.  */
1173	  state = 13;
1174	  PUT ('|');
1175	  break;
1176#endif
1177	case LEX_IS_LINE_COMMENT_START:
1178	  /* FIXME-someday: The two character comment stuff was badly
1179	     thought out.  On i386, we want '/' as line comment start
1180	     AND we want C style comments.  hence this hack.  The
1181	     whole lexical process should be reworked.  xoxorich.  */
1182	  if (ch == '/')
1183	    {
1184	      ch2 = GET ();
1185	      if (ch2 == '*')
1186		{
1187		  old_state = 3;
1188		  state = -2;
1189		  break;
1190		}
1191	      else if (ch2 != EOF)
1192		{
1193		  UNGET (ch2);
1194		}
1195	    }
1196
1197	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1198	    {
1199	      int startch;
1200
1201	      startch = ch;
1202
1203	      do
1204		{
1205		  ch = GET ();
1206		}
1207	      while (ch != EOF && IS_WHITESPACE (ch));
1208
1209	      if (ch == EOF)
1210		{
1211		  as_warn (_("end of file in comment; newline inserted"));
1212		  PUT ('\n');
1213		  break;
1214		}
1215
1216	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1217		{
1218		  /* Not a cpp line.  */
1219		  while (ch != EOF && !IS_NEWLINE (ch))
1220		    ch = GET ();
1221		  if (ch == EOF)
1222		    {
1223		      as_warn (_("end of file in comment; newline inserted"));
1224		      PUT ('\n');
1225		    }
1226		  else /* IS_NEWLINE (ch) */
1227		    {
1228		      /* To process non-zero add_newlines.  */
1229		      UNGET (ch);
1230		    }
1231		  state = 0;
1232		  break;
1233		}
1234	      /* Looks like `# 123 "filename"' from cpp.  */
1235	      UNGET (ch);
1236	      old_state = 4;
1237	      state = -1;
1238	      if (scrub_m68k_mri)
1239		out_string = "\tlinefile ";
1240	      else
1241		out_string = "\t.linefile ";
1242	      PUT (*out_string++);
1243	      break;
1244	    }
1245
1246#ifdef TC_D10V
1247	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1248	     Trap is the only short insn that has a first operand that is
1249	     neither register nor label.
1250	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1 .
1251	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1252	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1253	     only character in line_comment_chars for d10v, hence we
1254	     can recognize it as such.  */
1255	  /* An alternative approach would be to reset the state to 1 when
1256	     we see '||', '<-' or '->', but that seems to be overkill.  */
1257	  if (state == 10)
1258	    PUT (' ');
1259#endif
1260	  /* We have a line comment character which is not at the
1261	     start of a line.  If this is also a normal comment
1262	     character, fall through.  Otherwise treat it as a default
1263	     character.  */
1264	  if (strchr (tc_comment_chars, ch) == NULL
1265	      && (! scrub_m68k_mri
1266		  || (ch != '!' && ch != '*')))
1267	    goto de_fault;
1268	  if (scrub_m68k_mri
1269	      && (ch == '!' || ch == '*' || ch == '#')
1270	      && state != 1
1271	      && state != 10)
1272	    goto de_fault;
1273	  /* Fall through.  */
1274	case LEX_IS_COMMENT_START:
1275#if defined TC_ARM && defined OBJ_ELF
1276	  /* On the ARM, `@' is the comment character.
1277	     Unfortunately this is also a special character in ELF .symver
1278	     directives (and .type, though we deal with those another way).
1279	     So we check if this line is such a directive, and treat
1280	     the character as default if so.  This is a hack.  */
1281	  if ((symver_state != NULL) && (*symver_state == 0))
1282	    goto de_fault;
1283#endif
1284
1285#ifdef TC_ARM
1286	  /* For the ARM, care is needed not to damage occurrences of \@
1287	     by stripping the @ onwards.  Yuck.  */
1288	  if (to > tostart && *(to - 1) == '\\')
1289	    /* Do not treat the @ as a start-of-comment.  */
1290	    goto de_fault;
1291#endif
1292
1293#ifdef WARN_COMMENTS
1294	  if (!found_comment)
1295	    found_comment_file = as_where (&found_comment);
1296#endif
1297	  do
1298	    {
1299	      ch = GET ();
1300	    }
1301	  while (ch != EOF && !IS_NEWLINE (ch));
1302	  if (ch == EOF)
1303	    as_warn (_("end of file in comment; newline inserted"));
1304	  state = 0;
1305	  PUT ('\n');
1306	  break;
1307
1308#ifdef H_TICK_HEX
1309	case LEX_IS_H:
1310	  /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1311	     the H' with 0x to make them gas-style hex characters.  */
1312	  if (enable_h_tick_hex)
1313	    {
1314	      char quot;
1315
1316	      quot = GET ();
1317	      if (quot == '\'')
1318		{
1319		  UNGET ('x');
1320		  ch = '0';
1321		}
1322	      else
1323		UNGET (quot);
1324	    }
1325#endif
1326	  /* Fall through.  */
1327
1328	case LEX_IS_SYMBOL_COMPONENT:
1329	  if (state == 10)
1330	    {
1331	      /* This is a symbol character following another symbol
1332		 character, with whitespace in between.  We skipped
1333		 the whitespace earlier, so output it now.  */
1334	      UNGET (ch);
1335	      state = 3;
1336	      PUT (' ');
1337	      break;
1338	    }
1339
1340#ifdef TC_Z80
1341	  /* "af'" is a symbol containing '\''.  */
1342	  if (state == 3 && (ch == 'a' || ch == 'A'))
1343	    {
1344	      state = 16;
1345	      PUT (ch);
1346	      ch = GET ();
1347	      if (ch == 'f' || ch == 'F')
1348		{
1349		  state = 17;
1350		  PUT (ch);
1351		  break;
1352		}
1353	      else
1354		{
1355		  state = 9;
1356		  if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1357		    {
1358		      if (ch != EOF)
1359			UNGET (ch);
1360		      break;
1361		    }
1362		}
1363	    }
1364#endif
1365	  if (state == 3)
1366	    state = 9;
1367
1368	  /* This is a common case.  Quickly copy CH and all the
1369	     following symbol component or normal characters.  */
1370	  if (to + 1 < toend
1371	      && mri_state == NULL
1372#if defined TC_ARM && defined OBJ_ELF
1373	      && symver_state == NULL
1374#endif
1375	      )
1376	    {
1377	      char *s;
1378	      ptrdiff_t len;
1379
1380	      for (s = from; s < fromend; s++)
1381		{
1382		  int type;
1383
1384		  ch2 = *(unsigned char *) s;
1385		  type = lex[ch2];
1386		  if (type != 0
1387		      && type != LEX_IS_SYMBOL_COMPONENT)
1388		    break;
1389		}
1390
1391	      if (s > from)
1392		/* Handle the last character normally, for
1393		   simplicity.  */
1394		--s;
1395
1396	      len = s - from;
1397
1398	      if (len > (toend - to) - 1)
1399		len = (toend - to) - 1;
1400
1401	      if (len > 0)
1402		{
1403		  PUT (ch);
1404		  memcpy (to, from, len);
1405		  to += len;
1406		  from += len;
1407		  if (to >= toend)
1408		    goto tofull;
1409		  ch = GET ();
1410		}
1411	    }
1412
1413	  /* Fall through.  */
1414	default:
1415	de_fault:
1416	  /* Some relatively `normal' character.  */
1417	  if (state == 0)
1418	    {
1419	      state = 11;	/* Now seeing label definition.  */
1420	    }
1421	  else if (state == 1)
1422	    {
1423	      state = 2;	/* Ditto.  */
1424	    }
1425	  else if (state == 9)
1426	    {
1427	      if (!IS_SYMBOL_COMPONENT (ch))
1428		state = 3;
1429	    }
1430	  else if (state == 10)
1431	    {
1432	      if (ch == '\\')
1433		{
1434		  /* Special handling for backslash: a backslash may
1435		     be the beginning of a formal parameter (of a
1436		     macro) following another symbol character, with
1437		     whitespace in between.  If that is the case, we
1438		     output a space before the parameter.  Strictly
1439		     speaking, correct handling depends upon what the
1440		     macro parameter expands into; if the parameter
1441		     expands into something which does not start with
1442		     an operand character, then we don't want to keep
1443		     the space.  We don't have enough information to
1444		     make the right choice, so here we are making the
1445		     choice which is more likely to be correct.  */
1446		  if (to + 1 >= toend)
1447		    {
1448		      /* If we're near the end of the buffer, save the
1449		         character for the next time round.  Otherwise
1450		         we'll lose our state.  */
1451		      UNGET (ch);
1452		      goto tofull;
1453		    }
1454		  *to++ = ' ';
1455		}
1456
1457	      state = 3;
1458	    }
1459	  PUT (ch);
1460	  break;
1461	}
1462    }
1463
1464  /*NOTREACHED*/
1465
1466 fromeof:
1467  /* We have reached the end of the input.  */
1468  return to - tostart;
1469
1470 tofull:
1471  /* The output buffer is full.  Save any input we have not yet
1472     processed.  */
1473  if (fromend > from)
1474    {
1475      saved_input = from;
1476      saved_input_len = fromend - from;
1477    }
1478  else
1479    saved_input = NULL;
1480
1481  return to - tostart;
1482}
1483