/*
  wiring.c - Partial implementation of the Wiring API
  Originally part of Arduino - http://www.arduino.cc/
  Copyright (c) 2005-2006 David A. Mellis


  Copyright (c) 2018-2021 Spence Konde
  This has been ported to modern AVRs (Arduino team did that)
  Almost every part of it has since been rewritten for
  megaTinyCore and DxCore. This is the megaTinyCore version, and is
  part of megaTinyCore.

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General
  Public License along with this library; if not, write to the
  Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  Boston, MA  02111-1307  USA
*/

#include "wiring_private.h"
#include "util/delay.h"

/* Forward declaration of init_timers().
 * Declared with (void) so that this is a real prototype in C: with an empty
 * parameter list the compiler would not check the arguments of a call. */
void init_timers(void);

/* Configuration sanity checks: refuse to compile unless both the clock
 * frequency and the clock source selection have been defined. */
#if !defined(F_CPU)
  #error "F_CPU not defined. F_CPU must always be defined as the clock frequency in Hz"
#endif
#if !defined(CLOCK_SOURCE)
  #error "CLOCK_SOURCE not defined. Must be 0 for internal, 1 for crystal, or 2 for external clock"
#endif

/* __PeripheralControl is used to mark timer peripherals as being "taken over" by the user.
 * One bit per peripheral:
 *   0x40 = TIMERD0
 *   0x10 = TIMERA0
 *   0x08 = TIMERA1
 * The variable starts out as 0xFF (all bits set).
 * NOTE(review): presumably a set bit means the core still manages that timer and a
 * takeover clears it - confirm against the code that reads and writes this variable.
 * Implementation and use is not portable between cores - tradeoffs are made which
 * trade generalizability for low resource use.
 */
uint8_t __PeripheralControl = 0xFF;

// the prescaler is set so that timer ticks every 64 clock cycles, and the
// overflow handler is called every 256 ticks.

/* Choose the TCA prescaler from the system clock speed:
 *   F_CPU <=  1 MHz          -> divide by 8
 *   F_CPU <=  5 MHz          -> divide by 16
 *   F_CPU <= 30 MHz          -> divide by 64
 *   anything faster          -> divide by 256
 * When TCA0 is the millis timer, the divider picked here has to agree with
 * TIME_TRACKING_TIMER_DIVIDER. A disagreement between wiring.c and timers.h indicates
 * a defect in the core and would result in extremely bad behavior, so it is turned
 * into a compile error.
 */
#if (F_CPU <= 1000000)
  #define TIMERA_PRESCALER_bm (TCA_SPLIT_CLKSEL_DIV8_gc)
  #if defined(MILLIS_USE_TIMERA0) && (TIME_TRACKING_TIMER_DIVIDER != 8)
    #error "wiring.c and timers.h want to set millis timer TCA0 to different divider"
  #endif
#elif (F_CPU <= 5000000)
  #define TIMERA_PRESCALER_bm (TCA_SPLIT_CLKSEL_DIV16_gc)
  #if defined(MILLIS_USE_TIMERA0) && (TIME_TRACKING_TIMER_DIVIDER != 16)
    #error "wiring.c and timers.h want to set millis timer TCA0 to different divider"
  #endif
#elif (F_CPU <= 30000000)
  #define TIMERA_PRESCALER_bm (TCA_SPLIT_CLKSEL_DIV64_gc)
  #if defined(MILLIS_USE_TIMERA0) && (TIME_TRACKING_TIMER_DIVIDER != 64)
    #error "wiring.c and timers.h want to set millis timer TCA0 to different divider"
  #endif
#else
  #define TIMERA_PRESCALER_bm (TCA_SPLIT_CLKSEL_DIV256_gc)
  #if defined(MILLIS_USE_TIMERA0) && (TIME_TRACKING_TIMER_DIVIDER != 256)
    #error "wiring.c and timers.h want to set millis timer TCA0 to different divider"
  #endif
#endif


#ifndef MILLIS_USE_TIMERNONE

  // Disabled declarations, kept for reference:
  // volatile uint16_t microseconds_per_timer_overflow;
  // volatile uint16_t microseconds_per_timer_tick;

  // Describes how the millis timer interrupt is acknowledged: the naked millis ISR
  // writes intClear to the register that intStatusReg points at.
  struct sTimer {
    uint8_t            intClear;      // bit mask that is written
    volatile uint8_t  *intStatusReg;  // register the mask is written to
  };

  /* Millis timer selection. Each branch defines
   *   MILLIS_TIMER_VECTOR - the interrupt vector the millis ISR is attached to, and
   *   _timerS             - the flag mask / register pair used to acknowledge that interrupt.
   * The TCB branches additionally define _timer, a pointer to the selected TCB instance.
   * Selecting a timer the part does not have, or selecting none at all, is a compile error.
   */
  #if defined(MILLIS_USE_TIMERRTC)
    #define MILLIS_TIMER_VECTOR (RTC_CNT_vect)
    // NOTE(review): every other branch points intStatusReg at an INTFLAGS register, while
    // this one points at RTC.INTCTRL. The RTC ISR clears RTC.INTFLAGS directly and does not
    // go through _timerS, so this may simply be unused - confirm that INTCTRL is intended.
    const struct sTimer _timerS = {RTC_OVF_bm, &RTC.INTCTRL};
  #else
    // when TCD0 is used as millis source, this will be different from above, but 99 times out of 100, when a piece of code asks for clockCyclesPerMicrosecond(), they're asking about CLK_PER/CLK_MAIN/etc, not the unprescaled TCD0!


    #if defined (MILLIS_USE_TIMERA0)
      #ifndef TCA0
        #error "Selected millis timer, TCA0 does not exist on this part."
      #endif
      // TCA0 is handled through its SINGLE register view (overflow interrupt) when
      // TCA_BUFFERED_3PIN is defined, otherwise through its SPLIT view (HUNF interrupt).
      #if defined(TCA_BUFFERED_3PIN)
        #define MILLIS_TIMER_VECTOR (TCA0_OVF_vect)
        const struct sTimer _timerS = {TCA_SINGLE_OVF_bm,  &TCA0.SINGLE.INTFLAGS};
      #else
        #define MILLIS_TIMER_VECTOR (TCA0_HUNF_vect)
        const struct sTimer _timerS = {TCA_SPLIT_HUNF_bm,  &TCA0.SPLIT.INTFLAGS};
      #endif

    #elif defined (MILLIS_USE_TIMERA1)
      #ifndef TCA1
        #error "Selected millis timer, TCA1 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCA1_HUNF_vect)
      const struct sTimer _timerS = {TCA_SPLIT_HUNF_bm,  &TCA1.SPLIT.INTFLAGS};

    // TCB0..TCB4: identical apart from the instance; the CAPT flag is the one acknowledged.
    #elif defined(MILLIS_USE_TIMERB0)
      #ifndef TCB0
        #error "Selected millis timer, TCB0 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCB0_INT_vect)
      static volatile TCB_t *_timer = &TCB0;
      const struct sTimer _timerS = {TCB_CAPT_bm, &TCB0.INTFLAGS};

    #elif defined(MILLIS_USE_TIMERB1)
      #ifndef TCB1
        #error "Selected millis timer, TCB1 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCB1_INT_vect)
      static volatile TCB_t *_timer = &TCB1;
      const struct sTimer _timerS = {TCB_CAPT_bm, &TCB1.INTFLAGS};

    #elif defined(MILLIS_USE_TIMERB2)
      #ifndef TCB2
        #error "Selected millis timer, TCB2 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCB2_INT_vect)
      static volatile TCB_t *_timer = &TCB2;
      const struct sTimer _timerS = {TCB_CAPT_bm, &TCB2.INTFLAGS};

    #elif defined(MILLIS_USE_TIMERB3)
      #ifndef TCB3
        #error "Selected millis timer, TCB3 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCB3_INT_vect)
      static volatile TCB_t *_timer = &TCB3;
      const struct sTimer _timerS = {TCB_CAPT_bm, &TCB3.INTFLAGS};

    #elif defined(MILLIS_USE_TIMERB4)
      #ifndef TCB4
        #error "Selected millis timer, TCB4 does not exist on this part."
      #endif
      #define MILLIS_TIMER_VECTOR (TCB4_INT_vect)
      static volatile TCB_t *_timer = &TCB4;
      const struct sTimer _timerS = {TCB_CAPT_bm, &TCB4.INTFLAGS};

    #elif defined(MILLIS_USE_TIMERD0)
      #ifndef TCD0
        #error "Selected millis timer, TCD0, is only valid for 1-series tinyAVR"
      #endif
      #define MILLIS_TIMER_VECTOR (TCD0_OVF_vect)
      const struct sTimer _timerS = {TCD_OVF_bm, &TCD0.INTFLAGS};

    #else
      #error "No millis timer selected, but not disabled - can't happen!".
    #endif  /* defined(MILLIS_USE_TIMER__) */
  #endif  /* defined(MILLIS_USE_TIMERRTC) */

  // Converts a number of system clock cycles to microseconds (integer division, F_CPU in Hz).
  #define ClockCyclesToMicroseconds(cycles) ((cycles) / (F_CPU / 1000000L))
  // The millis ISR carries timer_fract over into timer_millis once it reaches this value.
  #define FRACT_MAX (1000)
  // Duration of one timer overflow in microseconds, split into whole milliseconds ...
  #define MILLIS_INC (ClockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF) / 1000)
  // ... and the remainder that is accumulated in timer_fract.
  #define FRACT_INC (ClockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF) % 1000)

  /* Timekeeping state updated by the millis ISR and read by millis()/micros().
   * RTC and TCB configurations keep only timer_millis. TCA/TCD configurations also
   * keep an overflow counter and a sub-millisecond remainder.
   * The member order of the TCA/TCD variant must not change: the naked ISR addresses
   * the members by byte offset from &timingStruct (timer_overflow_count at 0,
   * timer_millis at 4, timer_fract at 8).
   */
  struct sTimeMillis {
    #if (defined(MILLIS_USE_TIMERRTC) || defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4))     // Now TCB as millis source does not need fraction
      volatile uint32_t timer_millis;   // That's all we need to track here
    #else                               // TCAx or TCD0
      // A different member order, no longer in use:
      //volatile uint16_t timer_fract;
      //volatile uint32_t timer_millis;
      //volatile uint32_t timer_overflow_count;
      volatile uint32_t timer_overflow_count;  // incremented by 1 on every timer interrupt
      volatile uint32_t timer_millis;          // milliseconds, advanced by MILLIS_INC (+1 on carry from timer_fract)
      volatile uint16_t timer_fract;           // remainder, advanced by FRACT_INC and checked against FRACT_MAX

    #endif
  } timingStruct;

  // Now for the ISRs. This gets a little bit more interesting now...
  #if defined (MILLIS_USE_TIMERRTC)
    /* RTC millis handler: every RTC overflow accounts for 64000 ms.
     * Both the overflow and the compare flag are cleared on the way out,
     * whichever of them triggered the interrupt. */
    ISR(MILLIS_TIMER_VECTOR) {
      const uint8_t rtc_flags = RTC.INTFLAGS;
      if ((rtc_flags & RTC_OVF_bm) != 0) {
        timingStruct.timer_millis += 64000;
      }
      RTC.INTFLAGS = (RTC_CMP_bm | RTC_OVF_bm);
    }
  #else
    /* Millis handler shared by the TCA, TCB and TCD options (everything except the RTC).
     * It is declared ISR_NAKED, so saving registers, saving/restoring SREG and the final
     * reti are all written out by hand in the asm statements below. Registers used:
     * r24, r25 (TCA/TCD path only), r30/r31 (Z) and SREG; each one is pushed before use
     * and popped again before returning.
     */
    ISR(MILLIS_TIMER_VECTOR, ISR_NAKED) {
      // Common interrupt header for TCB, TCA and TCD (7 instructions):
      // acknowledges the timer interrupt flag and saves r24, SREG and the Z pointer.
      __asm__ __volatile__(
      "push       r24"            "\n\t" // Free up one register to load values into
      "ldi        r24, %[CLRFL]"  "\n\t" // Flag-clear mask of the selected timer (_timerS.intClear)
      "sts   %[PTCLR],      r24"  "\n\t" // write it to the register _timerS.intStatusReg points at to clear the flag
      "in         r24,     0x3F"  "\n\t" // Load SREG
      "push       r24"            "\n\t" // and push it on the Stack
      "push       r30"            "\n\t" // Save the Z registers, Z is used below as the pointer into timingStruct
      "push       r31"            "\n\t" //
      ::  [CLRFL] "M" (_timerS.intClear),
          [PTCLR] "m" (*_timerS.intStatusReg));

    #if (defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4))
      // TCB path: Z is loaded with &timingStruct.timer_millis through the "z" input operand
      // and the 32-bit value is incremented in place, one byte at a time, using only r24.
      __asm__ __volatile__(
      "ld         r24,        Z"  "\n\t" // Z points to LSB of timer_millis, load the LSB
      #if (F_CPU > 2000000)            // if it's 1 or 2 MHz, millis timer overflows every 2ms, intentionally sacrificing resolution for reduced time spent in ISR
      "subi       r24,     0xFF"  "\n\t" // sub immediate 0xFF is the same as to add 1. (There is no add immediate instruction, except add immediate to word)
      #else
      "subi       r24,     0xFE"  "\n\t" // sub immediate 0xFE is the same as to add 2
      #endif
      "st           Z,      r24"  "\n\t" // Store incremented value back to Z
      "ldd        r24,      Z+1"  "\n\t" // now load the next higher byte
      "sbci       r24,     0xFF"  "\n\t" // because this is sbci, it treats carry bit like subtraction, and unless we did just roll over with the last byte,
      "std        Z+1,      r24"  "\n\t" // carry will be set (borrow), leaving this byte unchanged. Thus, sbci 0xFF after a subi pressed into service to add, is the same as adc r1 after an add
      "ldd        r24,      Z+2"  "\n\t" // which is what we want.
      "sbci       r24,     0xFF"  "\n\t" // This gets repeated...
      "std        Z+2,      r24"  "\n\t" //
      "ldd        r24,      Z+3"  "\n\t" //
      "sbci       r24,     0xFF"  "\n\t" //
      "std        Z+3,      r24"  "\n\t" // ... until all 4 bytes were handled, at 4 clocks and 3 words per byte -> 16 clocks
      :: "z" (&timingStruct.timer_millis)
      ); // grrr, sublime highlights this as invalid syntax because it gets confused by the ifdef's and odd syntax on inline asm


      /* ISR in C:
        ISR (TCBx_INT_vect) {       // x depends on user configuration
          #if (F_CPU > 2000000)
            timer_millis += 1;
          #else
            timer_millis += 2;
          #endif
          _timer->INTFLAGS = TCB_CAPT_bm;   // reset Interrupt flag of TCBx
        }
      */
    #else // TCA0 or TCD0, also naked
    /*
      Disabled alternative implementation (commented out, kept for reference). It addresses
      every member with a fixed displacement from Z instead of advancing Z:

      __asm__ __volatile__(
      // ISR prologue (overall 10 words / 10 clocks (+ loading of Z)):
      "push       r25"            "\n\t" // one extra Register needed
      // timer_fract handling (8 words / 10 clocks):
      "ldd        r24,      Z+8"  "\n\t" // lo8(timingStruct.timer_fract).
      "ldd        r25,      Z+9"  "\n\t" // hi8(timingStruct.timer_fract)
      "subi       r24,%[LFRINC]"  "\n\t" // use (0xFFFF - FRACT_INC) and use the lower and higher byte to add by subtraction
      "sbci       r25,%[HFRINC]"  "\n\t" // can't use adiw since FRACT_INC might be >63
      "std        Z+8,      r24"  "\n\t" // lo8(timingStruct.timer_fract)
      "std        Z+9,      r25"  "\n\t" // hi8(timingStruct.timer_fract)
      "subi       r24,%[LFRMAX]"  "\n\t" // subtract FRACT_MAX and see if it is lower
      "sbci       r25,%[HFRMAX]"  "\n\t" //

      #if MILLIS_INC != 0                // (6 words / 4 - 5 clocks, branches were optimize to create minimal diversion)
      "brlo    higher"            "\n\t" // if FRAC_MAX was not reached,
      "ldi        r24, %[MIINC]"  "\n\t" // load "normal" MILLIS_INC (0x00-MILLIS_INC)
      "rjmp      sub4"            "\n\t" // avoid overwriting r24
      "higher:"
      #else                              // (4 words, 2 - 4 clocks)
      "brlo   sub_end"            "\n\t" // if we know at compile time that MILLIS_INC is 0,
      #endif                             // we don't have to check it at runtime, saving two insn (tst, branch)

      "std        Z+8,      r24"  "\n\t" // Overwrite the just stored value with the decremented value
      "std        Z+9,      r25"  "\n\t" // seems counter-intuitive, but it requires less registers
      "ldi        r24, %[MINCD]"  "\n\t" // load MILLIS_INC that was decreased by 1 (0xFF-MILLIS_INC)

      // timer_millis handling (12 words / 16 clocks):
      "sub4:"
      "ldd        r25,      Z+4"  "\n\t" // lo16.lo8(timingStruct.timer_millis)
      "sub        r25,      r24"  "\n\t" //
      "std        Z+4,      r25"  "\n\t" //
      "ldd        r25,      Z+5"  "\n\t" // lo16.hi8(timingStruct.timer_millis)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+5,      r25"  "\n\t" //
      "ldd        r25,      Z+6"  "\n\t" // hi16.lo8(timingStruct.timer_millis)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+6,      r25"  "\n\t" //
      "ldd        r25,      Z+7"  "\n\t" // hi16.hi8(timingStruct.timer_millis)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+7,      r25"  "\n\t" //
      "sub_end:"                         // only used if MILLIS_INC == 0
      // timer_overflow_count handling (12 words / 16 clocks):
      "ldd        r25,      Z+0"  "\n\t" // lo16.lo8(timingStruct.timer_overflow_count)
      "subi       r25,     0xFF"  "\n\t" //
      "std        Z+0,      r25"  "\n\t" //
      "ldd        r25,      Z+1"  "\n\t" // lo16.hi8(timingStruct.timer_overflow_count)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+1,      r25"  "\n\t" //
      "ldd        r25,      Z+2"  "\n\t" // hi16.lo8(timingStruct.timer_overflow_count)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+2,      r25"  "\n\t" //
      "ldd        r25,      Z+3"  "\n\t" // hi16.hi8(timingStruct.timer_overflow_count)
      "sbci       r25,     0xFF"  "\n\t" //
      "std        Z+3,      r25"  "\n\t" //
      // ISR epilogue (7 words / 15/16 clocks):
      "pop        r25"            "\n\t"  // new: total 72 - 74 clocks, 55 words / 53 - 75 clocks and 53 words with MILLIS_INC == 0
      :: "z" (&timingStruct),             // old: total 77 - 79 clocks total, and 58 words, vs 104-112 clocks and 84 words
        [LFRINC] "M" (((0x0000 - FRACT_INC)    & 0xFF)),
        [HFRINC] "M" (((0x0000 - FRACT_INC)>>8 & 0xFF)),
        [LFRMAX] "M" ((FRACT_MAX    & 0xFF)),
        [HFRMAX] "M" ((FRACT_MAX>>8 & 0xFF)),
        [MIINC]  "M" ((0x0000 - MILLIS_INC) & 0xFF),
        [MINCD]  "M" ((0xFFFF - MILLIS_INC) & 0xFF)
      );
  */

      /* TCA/TCD path. Z is loaded with &timingStruct, whose members in this configuration are
       * timer_overflow_count (offset 0), timer_millis (offset 4) and timer_fract (offset 8).
       * The "sub4" sequence adds to the 32-bit value at Z by subtracting r24 from its low byte
       * and 0xFF (with borrow) from the upper three bytes, post-incrementing Z by 4 on the way.
       * It is executed up to twice, with the T flag marking the first pass:
       *   pass 1 (T set):   r24 = 0xFF, so timer_overflow_count += 1; brts then returns to
       *                     ovf_end with Z pointing at timer_millis.
       *   pass 2 (T clear): r24 selects the amount added to timer_millis. When MILLIS_INC is 0
       *                     this pass is skipped (branch to sub_end) unless timer_fract carried.
       * Between the passes timer_fract (now at Z+4/Z+5) is advanced by FRACT_INC and compared
       * with FRACT_MAX.
       */
      __asm__ __volatile__(
      // ISR prologue (overall 10 words / 10 clocks (+ loading of Z)):
      "push       r25"            "\n\t" // second scratch register
      // timer_overflow_count handling (4 words / 4 + (18) + 1 = 23 clocks):
      "set"                       "\n\t" // set T: remember to come back here after sub4
      "ldi        r24,     0xFF"  "\n\t" // first byte to be subtracted of 4, rest will be 0xFF (i.e. add 1)
      "rjmp      sub4"            "\n\t" // jump down to sub/sbci
      "ovf_end:"                  "\n\t" // jump back to here afterwards
      "clt"                       "\n\t" // clear T: make sure to not jump back again

      // timer_fract handling (8 words / 10 clocks) (Z was advanced by 4 above):
      "ldd        r24,      Z+4"  "\n\t" // lo8(timingStruct.timer_fract).
      "ldd        r25,      Z+5"  "\n\t" // hi8(timingStruct.timer_fract)
      "subi       r24,%[LFRINC]"  "\n\t" // subtract the two bytes of (0x0000 - FRACT_INC), which adds FRACT_INC
      "sbci       r25,%[HFRINC]"  "\n\t" // can't use adiw since FRACT_INC might be >63
      "std        Z+4,      r24"  "\n\t" // lo8(timingStruct.timer_fract)
      "std        Z+5,      r25"  "\n\t" // hi8(timingStruct.timer_fract)
      "subi       r24,%[LFRMAX]"  "\n\t" // subtract FRACT_MAX and see if it is lower
      "sbci       r25,%[HFRMAX]"  "\n\t" // carry is set here if the updated fraction was below FRACT_MAX

      // NOTE(review): brlo (branch if carry set) is taken when the updated fraction is BELOW
      // FRACT_MAX, so in the MILLIS_INC > 0 variant control reaches "higher:" in exactly that
      // case, which reads as the opposite of what the label name suggests. The MILLIS_INC == 0
      // variant uses the same branch to skip the carry handling. Confirm that the two variants
      // are meant to differ in this way.
      #if MILLIS_INC > 0                 // (6 words / 4 - 5 clocks, branches were optimize to create minimal diversion)
      "brlo    higher"            "\n\t" // taken when the fraction is below FRACT_MAX
      "ldi        r24, %[MIINC]"  "\n\t" // load "normal" MILLIS_INC (0x00-MILLIS_INC), subtracting it adds MILLIS_INC
      "rjmp      sub4"            "\n\t" // avoid overwriting r24
      "higher:"
      #else                              // (4 words, 2 - 4 clocks)
      "brlo   sub_end"            "\n\t" // if we know at compile time that MILLIS_INC is 0,
      #endif                             // we don't have to check it at runtime, saving two insn (tst, branch)

      "std        Z+4,      r24"  "\n\t" // Overwrite the just stored value with the decremented value
      "std        Z+5,      r25"  "\n\t" // seems counter-intuitive, but it requires less registers
      "ldi        r24, %[MINCD]"  "\n\t" // load (0xFF-MILLIS_INC), subtracting it adds MILLIS_INC + 1

      // subtracting 4 bytes from a dword (13 words / 17 clocks)
      "sub4:"
      "ld         r25,        Z"  "\n\t" // lo16.lo8 of the value Z points at (overflow count on pass 1, timer_millis on pass 2)
      "sub        r25,      r24"  "\n\t" //
      "st          Z+,      r25"  "\n\t" //
      "ld         r25,        Z"  "\n\t" // lo16.hi8
      "sbci       r25,     0xFF"  "\n\t" //
      "st          Z+,      r25"  "\n\t" //
      "ld         r25,        Z"  "\n\t" // hi16.lo8
      "sbci       r25,     0xFF"  "\n\t" //
      "st          Z+,      r25"  "\n\t" //
      "ld         r25,        Z"  "\n\t" // hi16.hi8
      "sbci       r25,     0xFF"  "\n\t" //
      "st          Z+,      r25"  "\n\t" //
      "brts   ovf_end"            "\n\t" // If T bit is set, we need to go back up
      "sub_end:"
      // ISR epilogue (7 words / 15/16 clocks):
      "pop        r25"            "\n\t"
      :: "z" (&timingStruct),
        [LFRINC] "M" (((0x0000 - FRACT_INC)    & 0xFF)),
        [HFRINC] "M" (((0x0000 - FRACT_INC)>>8 & 0xFF)),
        [LFRMAX] "M" ((FRACT_MAX    & 0xFF)),
        [HFRMAX] "M" ((FRACT_MAX>>8 & 0xFF)),
        [MIINC]  "M" ((0x0000 - MILLIS_INC) & 0xFF),
        [MINCD]  "M" ((0xFFFF - MILLIS_INC) & 0xFF)
      );
    #endif /* (defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4)) */
    // Common ISR Epilogue for TCA, TCB and TCD, popping register in reverse Order
    // 6 words, 14 clocks
    __asm__ __volatile__(
      "pop        r31"            "\n\t"
      "pop        r30"            "\n\t" // 6 more clocks popping registers in reverse order.
      "pop        r24"            "\n\t" // pop r24 to get the old SREG value - 2 clock
      "out       0x3F,      r24"  "\n\t" // restore SREG - 1 clock
      "pop        r24"            "\n\t" // restore the caller's r24
      "reti"                      "\n\t" // and 4 clocks for reti
      ::
      );


    }
  #endif /* defined (MILLIS_USE_TIMERRTC)*/


  /*  Both millis and micros must take great care to prevent any kind of backward time travel.
   *
   * These values are unsigned, and should not decrease, except when they overflow. Hence when
   * we compare a value with what we recorded previously and find the new value to be lower, it
   * looks the same as it would 2^32 (4.2 billion) intervals in the future. Timeouts end prematurely
   * and similar undesired behaviors occur.
   *
   * There are three hazardous things we read here:
   * timer_millis, timer_overflow_count, and the timer count itself (TCxn.CNT).
   * The normal variables need only be read with interrupts disabled, in case of an
   * interrupt writing to it while we were reading it. AVRs are little-endian, so this would result
   * in the low byte being read before the overflow and the high byte after, and hence a value
   * higher than it should be for that call. Subsequent calls would return the right value.
   *
   * In the case of the timer value, it is more complicated.
   * Here, the hardware at first glance seems to protect us (see "reading 16-bit registers" in the
   * datasheet). But the register gets read in the interrupt, so we still need those disabled.
   * There is an additional risk though that we get a value from after the timer has overflowed
   * and since we disabled interrupts, the interrupt hasn't updated the overflow. We check the
   * interrupt flag, and if it's set, we check whether the timer value we read was near overflow
   * (the specifics vary by the timer - they're very different timers). If it isn't close to overflow
   * but the flag is set, we must have read it after the overflow, so we compensate for the missed
   * interrupt. If interrupts are disabled for long enough, this heuristic will be wrong, but in
   * that case it is the user's fault, as this limitation is widely known and documented, as well
   * as unavoidable. Failure to compensate looks like the inverse of the above case.
   *
   * (Note that only micros reads the timer, and hence, only micros can experience backwards time
   * travel due to interrupts being left disabled for too long; millis will just stop increasing.)
   *
   * Both of these cause severe breakage everywhere. The first type is simple to avoid, but if
   * missed can be more subtle, since it makes a big difference only if the byte where the read
   * was interrupted rolled over. The second type is more obvious, potentially happening on every timer
   * overflow, instead of just every 256th timer overflow, and when it does happen, anything waiting
   * for a specific number of microseconds to pass that gets that value will do so.
   * Though (see delay below) each incidence only short-circuits one ms of delay(), not the whole
   * thing.
   *
   * All time travel except for glitches from disabling millis for too long should no longer
   * be possible. If it is, that is a critical bug.
   */


  unsigned long millis() {
    // return timer_overflow_count; // for debugging timekeeping issues where these variables are out of scope from the sketch
    unsigned long m;
    // disable interrupts while we read timer_millis or we might get an
    // inconsistent value (e.g. in the middle of a write to timer_millis)
    uint8_t oldSREG = SREG;
    cli();
    #if defined(MILLIS_USE_TIMERRTC)
      uint16_t rtccount = RTC.CNT;
      m = timingStruct.timer_millis;
      if (RTC.INTFLAGS & RTC_OVF_bm) {
        /* There has just been an overflow that hasn't been accounted for by the interrupt. Check if the high bit of counter is set.
         * We just basically need to make sure that it didn't JUST roll over at the last couple of clocks. But this merthod is
         * implemented very efficiently (just an sbrs) so it is more efficient than other approaches. If user code is leaving
         * interrupts off nearly 30 seconds, they shouldn't be surprised. */
        if (!(rtccount & 0x8000)) m += 64000;
      }
      SREG = oldSREG;
      m += rtccount - (rtccount >> 5) + (rtccount >> 7);
    #else
      m = timingStruct.timer_millis;
      SREG = oldSREG;
    #endif
    return m;
  }
  #if !defined(MILLIS_USE_TIMERRTC)
  unsigned long micros() {
    uint32_t overflows, microseconds;
    #if (defined(MILLIS_USE_TIMERD0) || (defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4)))
      uint16_t ticks;
    #else /* TCA */
      uint8_t ticks;
    #endif
    uint8_t flags;
    /* Save current state and disable interrupts */
    uint8_t oldSREG = SREG;
    cli(); /* INTERRUPTS OFF */
    #if defined(MILLIS_USE_TIMERA0)
      ticks = TCA0.SPLIT.HCNT;
      flags = TCA0.SPLIT.INTFLAGS;
    #elif defined(MILLIS_USE_TIMERD0)
      TCD0.CTRLE = TCD_SCAPTUREA_bm;
      while (!(TCD0.STATUS & TCD_CMDRDY_bm)); // wait for sync - should be only one iteration of this loop
      flags = TCD0.INTFLAGS;
      ticks = TCD0.CAPTUREA;
    #else
      ticks = _timer->CNT;
      flags = _timer->INTFLAGS;
    #endif // end getting ticks
    /* If the timer overflow flag is raised, and the ticks we read are low, then the timer has rolled over but
     * ISR has not fired. If we already read a high value of ticks, either we read it just before the overflow,
     * so we shouldn't increment overflows, or interrupts are disabled and micros isn't expected to work so it
     * doesn't matter.
     * Get current number of overflows and timer count */
    #if !((defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4)))
      overflows = timingStruct.timer_overflow_count;
    #else
      overflows = timingStruct.timer_millis;
    #endif
    /* Turn interrupts back on, assuming they were on when micros was called. */
    SREG = oldSREG; /* INTERRUPTS ON */
    #if defined(MILLIS_USE_TIMERD0)
      if ((flags & TCD_OVF_bm) && (ticks < 0x07)) {
    #elif defined(MILLIS_USE_TIMERA0)
      ticks = (TIME_TRACKING_TIMER_PERIOD) - ticks;
      if ((flags & TCA_SPLIT_HUNF_bm) && (ticks < 0x04)) {
    #else // timerb
      if ((flags & TCB_CAPT_bm) && !(ticks & 0xFF00)) {
    #endif
    #if ((defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4)) && !(F_CPU > 2000000UL))
      overflows +=2;
    #else
      overflows++;
    #endif
      } // end getting ticks

    #if defined(MILLIS_USE_TIMERD0)
      #if (F_CPU == 20000000UL || F_CPU == 10000000UL || F_CPU == 5000000UL)
        uint8_t ticks_l = ticks >> 1;
        ticks = ticks + ticks_l + ((ticks_l >> 2) - (ticks_l >> 4) + (ticks_l >> 7));
        // + ticks +(ticks>>1)+(ticks>>3)-(ticks>>5)+(ticks>>8))
        // speed optimization via doing math with smaller datatypes, since we know high byte is 1 or 0.
        microseconds =   overflows * (TIME_TRACKING_CYCLES_PER_OVF / 20) + ticks; // ticks value corrected above.
      #else
        microseconds = ((overflows * (TIME_TRACKING_CYCLES_PER_OVF / 16))
                          + (ticks * (TIME_TRACKING_CYCLES_PER_OVF / 16 / TIME_TRACKING_TIMER_PERIOD)));
      #endif
      #if defined(CLOCK_TUNE_INTERNAL) && !(F_CPU == 16000000UL || F_CPU ==  20000000UL || F_CPU ==  8000000UL || F_CPU ==  10000000UL || F_CPU ==  4000000UL || F_CPU ==  5000000UL)
        #warning "TCD is not supported as a millis timing source when the oscillator is tuned to a frequency other than 16 or 20 MHz. Timing results will be wrong - use TCA0 or a TCB."
      #endif
    #elif (defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4))
      /* Ersatz Division for TCBs - now with inline assembly!
       *
       * It's well known that division is an operator you want to avoid like the plague on AVR.
       * Not only is it slow, the execution time isn't even easy to analyze - it depends on the
       * two operands, particularly the divisor... so you can't just look at the generated
       * assembly and count clock cycles, you've got to either time it experimentally with
       * a representative set of sample data, or know how many times it will pass through the
       * loops and then count clock cycles. If the operands aren't constant (if they are, you
       * can probably manage to get it optimized away at compile time) your best hope is likely
       * simulation, assuming you know enough about the values it will end up having to divide.
       *
       * Anyway. You don't want to be doing division. But that's what you need in order to
       * appropriately scale the tick count from the prescaler-deprived TCBs. Since many users
       * reconfigure the TCA for advanced PWM, using the TCA-prescaled clock was a non-starter
       * particularly since many parts have only one TCA. But division can be approximated
       * very closely if the divisor is constant using a series of bitshifts and addition/subtraction.
       *
       * The series of shifts was determined numerically with a spreadsheet that calculated the results for
       * each value that could come from the initial round of rightshifts for any combination of
       * bitshifts and provided a number of statistics to select based on. Backwards time travel must
       * never happen, or if it does, it must be a shorter backward step than micros runtime - 1 us
       * otherwise delay() will break and timeouts can instantly expire when it is hit. Similarly,
       * one wants to avoid large jumps forward, and cases where more consecutive "actual" times
       * than absolutely necessary return the same value (time should flow at a constant rate).
       * Finally, the artifacts of the calculation that are unavoidable should be distributed uniformly.
       * Undershooting or overshooting 999 endpoint at the counter's maximum value is the usual
       * source of large jumps (at the overflow point) in either direction. Terms should, as much as
       * possible alternate between positive and negative to minimize artifacts.
       *
       * The most popular/important speeds are hand-implemented in assembly because the compiler
       * was doing a miserable job of it - wasting 20-30% of the execution time and it's one of the few
       * Arduino API functions that users will be surprised and dismayed to find running slowly.
       * Not only does it run faster on "normal" boards (the 16 MHz clock eliminates the need to divide),
       * DxCore offers many speeds where the math doesn't all optimize away to nothing like it does at
       * 1/2/4/8/16/32.
       *
       * Do notice that we are replacing a smaller number of terms, and it's still much faster
       * The 10's went from 5 term ersatz-division to 6, while 12's went from 5 terms to 9, yet still
       * got a lot faster. The terrible twelves are the frequency most difficult to do this with.
       * Ironically, one of the two that are easiest is 36, which is good enough with 3 and
       * effectively exact (That "middle 12" is closer than the other 12's get with 9!)
       * 25 also matches it. Maybe it's something 25 and 36 being perfect squares?
       *
       * The three problems were that:
       * 1. Compiler generated code stubbornly insisted doing repeated shift operation in a loop
       * with 3 cycle per iteration (the shift itself took only 2)
       * 2. Compiler could not be convinced to do things that we know will always be < 255 as
       * bytes. Sure, it wouldn't know that - it's not legal for it to do that on it's own.
       * But even when I cast everything to uint8_t, it would shift a 16-bit value around
       * unnecessarily.
       * 3. It would distribute the ticks >> 4. That is, it wouldn't shift the value of
       * ticks in place, even though it wasn't referenced after this because I was assigning
       * the result to ticks, and hence the old value was "dead"
       * Instead, it would copy it, shift the copy 3 or 4 places. Then when it needed the
       * ticks >> 2, it would make a copy of the ORIGINAL and shift that 6 places,
       * instead of copying the copy and shifting just 2 places.
       *
       * A general trend seems to be that the compiler is not smart enough to "reuse" an
       * existing value that has already been shifted such that it's closer to the target.
       * at least for multi-byte variables. This is not the worst example of it I've run into
       * but the micros() function is a little bit sensitive to the execution time.
       * Apparently people sometimes want to *do something* in response to the value it
       * returns - and they seem to want to do that in a timely manner, otherwise they'd
       * have not bothered to record a time so accurately...
       *
       * general algorithm in the assembly implementations is:
       * start with ticks in a register pair, copy to r0, r1.
       * rightshift it until we have the 0th term (closest power of 2).
       * copy it back to the original location.
       * continue rightshifting it, adding or subtracting from the original when we reach
       * the appropriate terms.
       * As soon as we've rightshifted the original enough times that we know it's < 256,
       * we switch from lsr r1 ror r0 to just lsr r0. At the next term that we want to add
       * we copy it to r1. Subsequent subtractions or additions are single-byte until we've got the last term.
       * this time, we add r1 to r0 instead of the other way around.
       * we will need to clear r1 anyway, but we do it now, since we need a known 0 to do the carry.
       * we add that to the ticks intermediate value to get the final ticks value, and drop back into C
       * where we calculate overflows * 1000, add the (now 0-999) ticks to it, and return it.
       *
       */
        // Oddball clock speeds
      #if   (F_CPU == 44000000UL) // Extreme overclocking
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks - /* (ticks >> 1)  + */ (ticks >> 2) - (ticks >> 5) + /* (ticks >> 6) - */ (ticks >> 7)); // + (ticks >> 10)
      #elif (F_CPU == 36000000UL) // 50% overclock!
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks - (ticks >> 3) + (ticks >> 6)); // - (ticks >> 9) + (ticks >> 10) // with 5 terms it is DEAD ON
      #elif (F_CPU == 28000000UL) // Not supported by DxCore - nobody wants it.
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) - (ticks >> 3) + (ticks >> 5) - (ticks >> 6)); // + (ticks >> 8) - (ticks >> 9)
      #elif (F_CPU == 14000000UL) // Not supported by DxCore - nobody wants it.
        ticks = ticks >> 3;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) - (ticks >> 3) + (ticks >> 5) - (ticks >> 6)); // + (ticks >> 8) - (ticks >> 9)
      #elif (F_CPU == 30000000UL) // Easy overclock
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks + (ticks >> 3) - (ticks >> 4) + (ticks >> 7) - (ticks >> 8)); // 5 terms is the optimal. Good but not as close as we get for most.
      #elif (F_CPU == 27000000UL) // You'd think this one would be a flaming bitch right?
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) - (ticks >> 4) - (ticks >> 9)); // +0.1 average error with only 4 terms, minimal scatter... that's just not supposed to happen!
      #elif (F_CPU == 25000000UL) // Barely overclocked.
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks + /* (ticks >> 1) -*/ (ticks >> 2) + /* (ticks >> 4) -*/ (ticks >> 5)); // DEAD ON with 5 terms

      /* The Terrible Twelves (or threes) - Twelve may be a great number in a lot of ways... but here, it's actually 3 in disguise.
       * NINE TERMS in the damned bitshift division expansion. And the result isn't even amazing. - it's worse than what can be done
       * with just 5 terms for dividing by 36 or 25, or a mere 3 terms with 27... where you're dividing by 9, 12.5, and 13.5 respectively,
       * or after the initial shifts, by 0.78125, 1.25 or 1.18, and comparable to the best series for division by 1.375 (44 MHz) or 0.9375 (30 MHz) which each have 7 terms,
       * though it's better than the best possible for the division by 0.875 associated with 28 MHz clocks which is also a 7 term one.
       * This is division by 0.75, which sounds like it should be the easiest out of the lot.
       *
       * This does the following:
       * ticks = ticks >> (1, 2, 3, 4, or 5 for 3 MHz, 6 MHz, 12 MHz, 24 MHz, or 48 MHz)
       * ticks = ticks + (ticks >> 1) - (ticks >> 2) + (ticks >> 3) - (ticks >> 4) + (ticks >> 5) - (ticks >> 6) + (ticks >> 7) - (ticks >> 9)
       *
       * Equivalent to :
       * ticks = ticks / (1.5, 3, 6, 12, or 24)
       *
       * Division is way too slow, but we need to convert current timer ticks, which
       * are 0-1499, 0-2999, 0-5999, 0-11999, or 0-23999, into the 3 least significant digits
       * of the number of microseconds so that it can be added to overflows * 1000.
       *
       * Runtime of the assembly is 28, 30, 32, or 34 clocks
       * 3 and 6 MHz not a supported speed.
       * 57 replaced with 30 save 27 clocks @ 12 = 2 us saved
       * 67 replaced with 32 save 35 clocks @ 24 = 1.5us saved
       * 77 replaced with 34 save 43 clocks @ 48 = 1 us saved
       */
      #elif (F_CPU == 48000000UL || F_CPU == 24000000UL || F_CPU == 12000000UL || F_CPU == 6000000UL || F_CPU == 3000000UL)
        __asm__ __volatile__(
          "movw r0,%A0"   "\n\t" // we copy ticks to r0 (temp_reg) and r1 (zero_reg) so we don't need to allocate more registers.
          "lsr r1"        "\n\t" // notice how at first, each shift takes insns. Compiler wants to use an upper register, ldi number of shifts
          "ror r0"        "\n\t" // into it, then lsr, ror, dec, breq (4 insn + 5 clocks per shift, and including the ldi, it's 5 insns + 5*shiftcount clocks)
          #if (F_CPU != 3000000UL)
            "lsr r1"        "\n\t"
            "ror r0"        "\n\t"
          #endif
          #if (F_CPU == 12000000UL || F_CPU == 24000000UL || F_CPU == 48000000UL)
            "lsr r1"      "\n\t"  // sacrifice 1 word for 9 clocks on the 12 MHz configuration
            "ror r0"      "\n\t"
          #endif
          #if (F_CPU == 24000000UL || F_CPU == 48000000UL)
            "lsr r1"      "\n\t"  // sacrifice 3 words for 12 clocks on the 24 MHz configuration
            "ror r0"      "\n\t"
          #endif
          #if (F_CPU == 48000000UL)
            "lsr r1"      "\n\t"  // sacrifice 5 words for 15 clocks on the 48 MHz configuration.
            "ror r0"      "\n\t"
          #endif
          "movw %A0,r0"   "\n\t"  // This is the value we call ticks, because that's what it was in old code.
          "lsr r1"        "\n\t"  // we just copied the now shifted value back to original location.
          "ror r0"        "\n\t"  // 2 words per shift still
          "add %A0, r0"   "\n\t"  // we now have ticks >> 1, add it to original.
          "adc %B0, r1"   "\n\t"  //
          "lsr r1"        "\n\t"  //
          "ror r0"        "\n\t"  // we now have ticks >> 2. Now it's under 250, and r1 is 0
          "mov r1,r0"     "\n\t"  // so we copy the remaining value into r1.
          "lsr r1 "       "\n\t"  // now it's only 1 insn/shift!
          "sub r0,r1"     "\n\t"  // - ticks >> 3
          "lsr r1"        "\n\t"
          "add r0,r1"     "\n\t"  // + ticks >> 4
          "lsr r1"        "\n\t"
          "sub r0,r1"     "\n\t"  // - ticks >> 5
          "lsr r1"        "\n\t"
          "add r0,r1"     "\n\t"  // + ticks >> 6
          "lsr r1"        "\n\t"
          "sub r0,r1"     "\n\t"  // - ticks >> 7
          "lsr r1"        "\n\t"
          "lsr r1"        "\n\t"
          "add r0,r1"     "\n\t"  // + ticks >> 9
          "eor r1,r1"     "\n\t"  // clear out r1
          "sub %A0,r0"    "\n\t"  // Add the sum of terms that fit in a byte to what was ticks in old code.
          "sbc %B0,r1"    "\n"    // carry - see,this is why AVR needs a known zero.
          : "+r" (ticks));        // Do the rest in C. ticks is a read/write operand.
        microseconds = overflows * 1000 + ticks; // nice and clean.

      /* The Troublesome Tens - I initially fumbled this after the **now** r1 is 0 line
       * I did several dumb things - at first I thought it was my pointless moving and
       * adding. But the real problem was that on that line, I'd just deleted the
       * now unnecessary lsr r1, leaving the next as ror instead of lsr. So instead of pushing
       * that bit into the void, it came back as the new high bit, causing the device to travel
       * back in time. Unfortunately, a few hundred milliseconds isn't far back enough to
       * snag a winning ticket for today's lotto, but more than the execution time
       * of micros is far enough back to thoroughly break delay() Even if I could just go back
       * just far enough to tell myself where the bug was, I'd take it...
       *
       * This does the following:
       * ticks = ticks >> (1, 2, 3, or 4 for 5 MHz, 10 MHz, 20 MHz, or 40 MHz)
       * ticks = ticks - (ticks >> 2) + (ticks >> 4) - (ticks >> 6) + (ticks >> 8)
       *
       * Equivalent to:
       * ticks = tick / (2.5, 5, 10, or 20)
       * Division is way too slow, but we need to convert current timer ticks, which
       * are 0-2499, 0-4999, 0-9999, or 0-19999, into the 3 least significant digits
       * of the number of microseconds so that it can be added to overflows * 1000.
       *
       * Runtime is 23,25,27, or 29 clocks, savings vs the best I could do in C
       *
       * 33 replaced with 23 save 10 clocks @ 5  = 2 us saved
       * 46 replaced with 25 save 21 clocks @ 10 = 2.5 us saved
       * 56 replaced with 27 save 29 clocks @ 20 = 1.5 us saved
       * 66 replaced with 29 save 37 clocks @ 40 = 1 us saved
       */
      #elif (F_CPU == 40000000UL || F_CPU == 20000000UL || F_CPU == 10000000UL || F_CPU == 5000000UL)
        __asm__ __volatile__(
          "movw r0,%A0"   "\n\t"  // no savings until after the initial rightshifts at 5 MHz
          "lsr r1"        "\n\t"
          "ror r0"        "\n\t"
          #if (F_CPU == 10000000UL || F_CPU == 20000000UL || F_CPU == 40000000UL)
            "lsr r1"      "\n\t"  // sacrifice 1 word for 9 clocks at 10 MHz
            "ror r0"      "\n\t"
          #endif
          #if (F_CPU == 20000000UL || F_CPU == 40000000UL)
            "lsr r1"      "\n\t"  // sacrifice 3 words for 12 clocks at 20 MHz
            "ror r0"      "\n\t"
          #endif
          #if (F_CPU == 40000000UL)
            "lsr r1"      "\n\t"  // sacrifice 5 words for 15 clocks at 40 MHz
            "ror r0"      "\n\t"
          #endif
          "movw %A0,r0"   "\n\t"  // ticks
          "lsr r1"        "\n\t"
          "ror r0"        "\n\t"
          "lsr r1"        "\n\t"
          "ror r0"        "\n\t"  //   ticks >> 2.
          "sub %A0, r0"   "\n\t"  // - ticks >> 2
          "sbc %B0, r1"   "\n\t"  // It could be 312 so we can't do what we did for the 12's
          "lsr r1"        "\n\t"
          "ror r0"        "\n\t"  // **now** r1 is 0.
          "lsr r0"        "\n\t"
          "mov r1,r0"     "\n\t"  // + ticks >> 4
          "lsr r1"        "\n\t"
          "lsr r1"        "\n\t"
          "sub r0,r1"     "\n\t"  // - ticks >> 6
          "lsr r1"        "\n\t"
          "lsr r1"        "\n\t"
          "add r0,r1"     "\n\t"  // + ticks >> 8
          "eor r1,r1"     "\n\t"  // restore zero_reg
          "add %A0,r0"    "\n\t"  // add to the shifted ticks
          "adc %B0,r1"    "\n"    // carry
          : "+r" (ticks));        // Do the rest in C. ticks is a read/write operand.
        microseconds = overflows * 1000 + ticks;
      /* replaces:
      #elif (F_CPU == 48000000UL) // Extreme overclocking
        ticks = ticks >> 5;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) + (ticks >> 3) - (ticks >> 5)); // - (ticks >> 7)
      #elif (F_CPU == 24000000UL) // max rated speed
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) + (ticks >> 3) - (ticks >> 5)); // - (ticks >> 7)
      #elif (F_CPU == 12000000UL)
        ticks = ticks >> 3;
        microseconds = overflows * 1000 + (ticks + (ticks >> 2) + (ticks >> 3) - (ticks >> 5)); // - (ticks >> 7)
      // Never was an implementation for 3 or 6, but it's obvious what the old style implementation would be,
      #elif (F_CPU == 40000000UL) // overclocked aggressively
        ticks = ticks >> 4;
        microseconds = overflows * 1000 + (ticks - (ticks >> 2) + (ticks >> 4) - (ticks >> 6)); // + (ticks >> 8)
      #elif (F_CPU == 20000000UL)
        ticks = ticks >> 3;
        microseconds = overflows * 1000 + (ticks - (ticks >> 2) + (ticks >> 4) - (ticks >> 6)); // + (ticks >> 8)
      #elif (F_CPU == 10000000UL)
        ticks = ticks >> 2;
        microseconds = overflows * 1000 + (ticks - (ticks >> 2) + (ticks >> 4) - (ticks >> 6)); // + (ticks >> 8)
      #elif (F_CPU ==  5000000UL)
        ticks = ticks >> 1;
        microseconds = overflows * 1000 + (ticks - (ticks >> 2) + (ticks >> 4) - (ticks >> 6)); // + (ticks >> 8)
      */

      // powers of 2  - and a catchall for parts without dedicated implementations. It gives wrong results, but
      // it also doesn't take forever like doing division would.
      #elif (F_CPU  == 32000000UL || F_CPU > 24000000UL)
        microseconds = overflows * 1000 + (ticks >> 4);
      #elif (F_CPU  == 16000000UL || F_CPU > 12000000UL)
        microseconds = overflows * 1000 + (ticks >> 3);
      #elif (F_CPU  ==  8000000UL || F_CPU >  6000000UL)
        microseconds = overflows * 1000 + (ticks >> 2);
      #elif (F_CPU  ==  4000000UL || F_CPU >= 3000000UL)
        microseconds = overflows * 1000 + (ticks >> 1);
      #else //(F_CPU == 1000000UL || F_CPU == 2000000UL) - here clock is running at system clock instead of half system clock.
            // and hence overflows only once per 2ms. On 2 MHz
            // also works at 2MHz, since we use CLKPER for 1MHz vs CLKPER/2 for all others.
        microseconds   = overflows * 1000 + ticks;
      #endif
      #if !((F_CPU == 48000000UL || F_CPU == 36000000UL || F_CPU == 24000000UL || F_CPU == 12000000UL || /* multiples of 12           */ \
             F_CPU == 40000000UL || F_CPU == 30000000UL || F_CPU == 20000000UL || F_CPU == 10000000UL || /* multiples of 10           */ \
             F_CPU == 32000000UL || F_CPU == 16000000UL || F_CPU ==  8000000UL || F_CPU ==  4000000UL || /* powers of 2               */ \
             F_CPU ==  2000000UL || F_CPU ==  1000000UL || F_CPU == 25000000UL || F_CPU ==  5000000UL || /* powers of 2 cont, 25, 5   */ \
             F_CPU == 44000000UL || F_CPU == 28000000UL || F_CPU == 14000000UL || F_CPU ==  3000000UL || /* oddball frequencies       */ \
             F_CPU == 27000000UL)&& /* warn fools who messed with the timers.h file too and expected that the core would sort out how */ \
            ((TIME_TRACKING_TIMER_DIVIDER == 2 && TIME_TRACKING_TICKS_PER_OVF == F_CPU/2000) || /*how to make the timer work correctly*/ \
             (TIME_TRACKING_TIMER_DIVIDER == 1 && (TIME_TRACKING_TICKS_PER_OVF == F_CPU/500 && F_CPU == 1000000) || (TIME_TRACKING_TICKS_PER_OVF == F_CPU/1000 && F_CPU == 2000000))))
                                                 /*  without them implementing it. No such luck  */
        #warning "Millis timer (TCBn) at this frequency and/or configuration unsupported, micros() will return totally bogus values."
      #endif
    #else // Done with TCB
      #if (F_CPU == 30000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + ((ticks * 2) + ((uint16_t)(ticks >> 3)));
      #elif (F_CPU == 28000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + ((ticks * 2) + ((uint16_t)(ticks >> 2) + (ticks >> 5)));
      #elif (F_CPU == 25000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + (ticks * 2 + ((uint16_t)(ticks >> 1) + (ticks >> 4)));
      #elif (F_CPU == 24000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + (ticks * 3 - ((uint16_t)(ticks >> 2) - (ticks >> 4) - (ticks >> 5)));
      #elif (F_CPU == 20000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + (ticks * 3 + ((uint16_t)(ticks >> 2) - (ticks >> 4)));
      #elif (F_CPU == 28000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + (ticks * 4 + ((uint16_t)(ticks >> 1) + (ticks >> 4) + (ticks >> 5)));
      #elif (F_CPU == 12000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + (ticks * 5 + ((uint16_t)(ticks >> 2) + (ticks >> 4) + (ticks >> 5)));
      #elif (F_CPU == 10000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 64)
        microseconds = (overflows * clockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
            + ((ticks << 3) - ((uint16_t)(ticks << 1) + (ticks >> 1) - (ticks >> 3)));
      #elif (F_CPU == 5000000UL && TIME_TRACKING_TICKS_PER_OVF == 255 && TIME_TRACKING_TIMER_DIVIDER == 16)
        microseconds = (overflows * millisClockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
                     + (ticks * 3 + ((uint16_t)(ticks >> 2) - (ticks >> 4)));
      #else
        #if (TIME_TRACKING_TIMER_DIVIDER%(F_CPU/1000000))
          #warning "Millis timer (TCA0) at this frequency unsupported, micros() will return bogus values."
        #endif
        microseconds = ((overflows * millisClockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF))
                      + (ticks * (millisClockCyclesToMicroseconds(TIME_TRACKING_CYCLES_PER_OVF) / TIME_TRACKING_TIMER_PERIOD)));
      #endif
    #endif // end of timer-specific part of micros calculations
    return microseconds;
  }
  #else // MILLIS_USE_TIMERRTC is defined, so we don't have micros
    /* We do not have a timebase sufficiently accurate to give microsecond timing. In fact, we barely have millisecond timing available
     * The microsecond delay counts clock cycles, and so it does still work. It is planned that a future library will
     * pause millis before sleeping and turn on the RTC, tracking the passage of time at a much coarser resolution with that, and turn
     * it back on when waking from sleep, so people can keep time while sleeping without sacrificing micros().
     * In any event, as of 2.4.3 we now provide the stub below, which we hope is more useful than being told that micros() isn't defined.
     */
      /* Stub micros() compiled when MILLIS_USE_TIMERRTC is defined (no microsecond timebase).
       * It hands an explanatory message to badCall() and then returns -1.
       * NOTE(review): badCall() is declared outside this section; presumably it turns any call
       * that survives optimization into a build-time error - confirm against its declaration. */
      unsigned long micros() {
        badCall("microsecond timekeeping is not supported when the RTC is used as the sole timekeeping timer (though delayMicroseconds() is)");
        return -1; // converted to unsigned long, i.e. the largest representable value
      }
    #endif
#else // MILLIS_USE_TIMERNONE defined - we have neither of these functions.
    /* Users should not call millis() or micros() if the core timekeeping has been disabled. Usually, encountering this error either means
     * that they disabled millis earlier for some other sketch, and the preferences were saved with that - or that they are using a library
     * with a dependence on the timekeeping facilities. Sometimes these are meaningful, other times it is only for a feature that isn't
     * being used, or to catch a particular corner case (see: tinyNeoPixel, very end of show() for an example).
     * As of 2.4.3 we provide the stubs below, which we hope is more useful than being told that millis or micros isn't defined.
     */
    /* Stub micros() compiled when MILLIS_USE_TIMERNONE is defined (core timekeeping disabled).
     * It hands an explanatory message to badCall() and then returns -1.
     * NOTE(review): badCall() is declared outside this section; presumably it turns any call
     * that survives optimization into a build-time error - confirm against its declaration. */
    unsigned long micros() {
      badCall("micros() is not available because it has been disabled through the tools -> millis()/micros() menu");
      return -1; // converted to unsigned long, i.e. the largest representable value
    }
    /* Stub millis() compiled when MILLIS_USE_TIMERNONE is defined (core timekeeping disabled).
     * It hands an explanatory message to badCall() and then returns -1.
     * NOTE(review): badCall() is declared outside this section; presumably it turns any call
     * that survives optimization into a build-time error - confirm against its declaration. */
    unsigned long millis() {
      badCall("millis() is not available because it has been disabled through the tools -> millis()/micros() menu");
      return -1; // converted to unsigned long, i.e. the largest representable value
    }
#endif // MILLIS_USE_TIMERNONE code


/* delay()
 * So what do you WANT in a good delay function?
 * First, obviously you want it to delay things. You do not want it to block interrupts (then a long one would throw off
 * timekeeping, miss inputs, and so on). And you want the compiled size to not be prohibitive for the part.
 * The reason it's so important wrt. interrupts is that in Arduino standard delay(), if an interrupt fires in the middle,
 * it will still end at the same time - it is "interrupt insensitive". Whenever a delay is using the builtin _delay_ms()
 * if that is interrupted it has no way of knowing time has passed. Now hopefully you're not spending so much time in
 * an ISR that this is significant, but it is still undesirable.
 *
 * For the unfortunate souls using small-flash parts, the flash usage becomes a major problem - why is it such a space-hog?
 * Because it has to pull in micros(), which is bulky even with the division turned into bitshifts... RTC has same problem
 * with millis(), the conversion of 1024ths of a second to 1000ths is a killer, even with the bitshift tricks,
 * and the compiler seems really stupid about how it handles it; I can't keep it from making an extra copy of the 32-bit
 * value, which ALSO requires 4 more push and pop operations to get registers it can use.
 *
 * Now we will use one of three delay() implementations:
 * If you have 16k+ your delay is the standard one, it pulls in micros(), yes, but you may well already have grabbed
 *  that for your sketch already, and the delay is more accurate and fully interrupt insensitive, and you can afford
 *  the memory. For RTC users they will get the analogous implementation that is based on millis.
 * Users with millis disabled, or with less than 16k flash and using RTC will get the implementation based on _delay_ms().
 * Everyone else (flash under 16k but millis enabled via non-RTC timer) will get the light version which calls _delay_ms()
 *  if the delay is under 16 ms to get less flash usage, and calculates the delay using **millis** not micros otherwise,
 *  saving over 100b of flash. The reason for the split is that the limited granularity of millis introduces an error in
 *  the delay duration of up to 1ms. That doesn't matter much when you call delay(1000) on an internal clock that's within
 *  1% on a good day. It matters greatly when you call delay(1);    */

#if defined(MILLIS_USE_TIMERNONE) || (PROGMEM_SIZE < 16384 && defined(MILLIS_USE_TIMERRTC))
  /* delay() built purely on _delay_ms(): used when millis is disabled, or on parts with less
   * than 16k of flash that keep time with the RTC.
   * ms - number of milliseconds to wait.
   * Nothing here measures elapsed time, so time spent in interrupts prolongs the delay. */
  void delay(uint32_t ms) {
    /* A compile-time constant can be handed to _delay_ms() in one go. */
    if (__builtin_constant_p(ms)) {
      _delay_ms(ms);
      return;
    }
    /* Otherwise burn one millisecond per remaining count. */
    for (; ms != 0; ms--) {
      _delay_ms(1);
    }
  }
#elif (PROGMEM_SIZE >= 16384 && !defined(MILLIS_USE_TIMERRTC))
  /* delay() based on micros(): used on parts with at least 16k of flash whose millis timer
   * is not the RTC.
   * ms - number of milliseconds to wait.
   * Elapsed time is measured with micros(), so time spent in interrupts is not added on top
   * of the requested delay. Only the low 16 bits of micros() are tracked.
   * NOTE(review): the elapsed-time test relies on uint16_t subtraction wrapping modulo 65536,
   * which holds where int is 16 bits (avr-gcc) - confirm if this is ever built elsewhere. */
  void delay(uint32_t ms) {
    uint16_t mark = (uint16_t) micros();
    while (ms != 0) {
      /* Each time a further 1000 us have elapsed since mark, retire one millisecond and
       * advance mark by exactly 1000 so that no fraction of a millisecond is lost. */
      for (; ((uint16_t) micros() - mark) >= 1000 && ms; mark += 1000) {
        ms--;
      }
    }
  }
#else
  /* Light-weight delay() used for every configuration not handled by the two variants above.
   * ms - number of milliseconds to wait.
   * Delays of 16 ms or more are timed with millis(); shorter ones are counted out with
   * _delay_ms(), where the granularity of millis() would be a noticeable error. */
  void delay(uint32_t ms) {
    if (ms >= 16) {
      /* Long delay: wait until millis() has advanced by ms since we started. */
      uint32_t began = millis();
      while ((millis() - began) < ms) {
        /* busy-wait */
      }
      return;
    }
    /* Short delay known at compile time: a single _delay_ms() call. */
    if (__builtin_constant_p(ms)) {
      _delay_ms(ms);
      return;
    }
    /* Short delay known only at run time: one millisecond per pass. */
    for (; ms != 0; ms--) {
      _delay_ms(1);
    }
  }
#endif

/* delayMicroseconds(): always-inlined dispatcher that picks a delay strategy per call site.
 * us - number of microseconds to wait.
 * When us is a compile-time constant the avr-libc _delay_us() is used; any other value is
 * forwarded to the out-of-line _delayMicroseconds(). */
inline __attribute__((always_inline)) void delayMicroseconds(unsigned int us) {
  if (__builtin_constant_p(us)) {
    _delay_us(us); // constant delay: let avr-libc generate the wait
    return;
  }
  _delayMicroseconds(us); // run-time value: Arduino-style implementation
}

/* delayMicroseconds() when delay is not a compile-time known constant.
 * Delay for the given number of microseconds. This is UGLY AS SIN and explicitly depends on function call
 * overhead for very short delays.
 * High clock speeds shouldn't return immediately for a 1us delay - we can instead only drop a fraction of a us
 * 48, 44, 40, and 32 drop 1/2 us, and 36 drops 2/3rds.
 * Note that us ceases to be in units of microseconds as soon as the function is entered; it gets turned into the loop counter.
 * Then we use a minimal number of bitshifts to calculate the number of passes through the delay loop
 * and subtract the number of loop-cycles of time we burned doing so. But need to be careful that sane values
 * don't get so much bigger that they overflow the unsigned int we're storing it in. To that end, we use
 * a longer loop at faster clock speeds.
 * In the inline assembly, when a delay of 8 clocks or longer is required, we save flash with a clever trick:
 *  "rjmp .+2" "\n\t"     // 2 cycles - jump over the return.
 *  "ret" "\n\t"          // 4 cycles - rjmped over initially...
 *  "rcall .-4" "\n\t"    // 2 cycles - ... but then called here...
 * This exploits the fact that return is a 4-clock instruction (even on AVRxt) by first hopping over a return
 * then immediately calling that return instruction - 8 clocks in 3 words. Once the ret is there, additional
 * rcall instructions can get 6 clocks in a single word, though we only get to take advantage of that once for
 * the 30 MHz case and any longer delays do better with a loop.
 */

/* Choose the delay-loop length for the non-constant delayMicroseconds() path from F_CPU.
 * Exactly one branch of this chain is taken, so at most one DELAYMICROS_<N> macro ends up
 * defined; the branches for 16 MHz, 12 MHz and the final #else define none. The note in each
 * branch gives the loop length in clock cycles and what happens for a 1 us request.
 * NOTE(review): the code that tests these macros lies outside this section - presumably the
 * delay loop of _delayMicroseconds() - confirm there before changing a branch.
 */
#if   F_CPU >= 48000000L
  // 16 MHz math, 12-cycle loop, 1us burns and passes through loop twice.
  #define DELAYMICROS_TWELVE
#elif F_CPU >= 44000000L
  // 16 MHz math, 11-cycle loop, 1us burns and passes through loop twice.
  #define DELAYMICROS_ELEVEN
#elif F_CPU >= 40000000L
  // 20 MHz math, 10-cycle loop, 1us burns and passes through loop twice.
  #define DELAYMICROS_TEN
#elif F_CPU >= 36000000L
  // 12 MHz math, 12-cycle loop, 1us burns and passes through loop once.
  #define DELAYMICROS_TWELVE
#elif F_CPU >= 32000000L
  // 16 MHz math, 8-cycle loop, 1us passes through loop twice.
  #define DELAYMICROS_EIGHT
#elif F_CPU >= 30000000L
  // 12 MHz math, 10-cycle loop, 1us burns and returns.
  #define DELAYMICROS_TEN
#elif F_CPU >= 28000000L
  // 16 MHz math, 7-cycle loop, 1us burns and returns.
  #define DELAYMICROS_SEVEN
#elif F_CPU >= 27000000L
  // 12 MHz math, 9 cycle loop, 1us burns and returns
  #define DELAYMICROS_NINE
#elif F_CPU >= 24000000L
  // 12 MHz math, 8-cycle loop, 1us burns and returns.
  #define DELAYMICROS_EIGHT
#elif F_CPU >= 20000000L
  // 20 MHz math, 10-cycle loop, 1us burns and returns.
  #define DELAYMICROS_TEN
#elif F_CPU >= 16000000L
  // 16 MHz math, 4-cycle loop, 1us returns immediately.
#elif F_CPU >= 12000000L
  // 16 MHz math, 4-cycle loop, 1us returns immediately.
  // NOTE(review): this text is identical to the 16 MHz branch above; a 4-cycle loop at 12 MHz
  // runs 3 times per us, so "12 MHz math" may have been intended - confirm.
#elif F_CPU >= 10000000L || (F_CPU >= 5000000L && F_CPU < 8000000L)
  // 10 MHz: 5-cycle loop, 1us returns immediately
  // 5 MHz: 5-cycle loop, 1-3 us returns immediately.
  #define DELAYMICROS_FIVE
#else
  // 8 MHz: 16 MHz math, 4-cycle loop, 1-2 us returns immediately.
  // 4 MHz: 16 MHz math, 4-cycle loop, 1-4 us returns immediately.
  // 2 MHz: 16 MHz math, 4-cycle loop, 1-8 us returns immediately.
  // 1 MHz: 16 MHz math, 4-cycle loop, < 16 us returns immediately, < 25 burns and returns.
  // An F_CPU that is not listed uses the settings of the fastest listed speed that is slower than it.
#endif

__attribute__ ((noinline)) void _delayMicroseconds(unsigned int us) {
 /* Must be noinline because we rely on function-call overhead */
#if F_CPU == 48000000L
  // make the initial delay 24 cycles
  __asm__ __volatile__ (
    "rjmp .+2" "\n\t"     // 2 cycles - jump over next instruction.
    "ret" "\n\t"          // 4 cycles - rjmped over initially....
    "rcall .-4");         // 2 cycles - ... but then called here);
                          // wait 8 cycles with 3 words
  // the loop takes 1/4 of a microsecond (12 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles
  // we only burned ~22 cycles above, subtraction takes another 2 - so we've lost
  // half a us and only need to drop 2 rounds through the loop!
  us -= 2; // = 2 cycles,

#elif F_CPU >= 44000000L
  // Again, we can do all this in half of 1 us, so we
  // just pass through the loop 2 times for 1 us delay.
  __asm__ __volatile__(
    "rjmp .+0" "\n\t"     // 2 cycles
    "rjmp .+0" "\n\t"     // 2 cycles
    "nop");               // 1 cycles
                          // Wait 5 cycles in 3 words.
  // the loop takes 1/4 of a microsecond (11 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles
  // we just burned 19 (21) cycles above, remove 2
  // us is at least 8 so we can subtract 2
  us -= 2;

#elif F_CPU >= 40000000L
  // Again, we can do all this in half of 1 us, so we
  // just pass through the loop 2 times for 1 us delay.
  __asm__ __volatile__(
    "rjmp .+0" "\n\t"     // 2 cycles
    "nop");               // 1 cycles
                          // Wait 3 cycles in 2 words.
  // the loop takes 1/4 of a microsecond (10 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles
  // we just burned 17 (19) cycles above, remove 2.
  // us is at least 8 so we can subtract 2
  us -= 2;

#elif F_CPU >= 36000000L
  // Here we get the initial delay is about 24 cycles, so we pass through
  // the loop once for 1us delay.
  __asm__ __volatile__ (
    "rjmp .+0" "\n\t"     // 2 cycles
    "rjmp .+2" "\n\t"     // 2 cycles - jump over next instruction.
    "ret" "\n\t"          // 4 cycles - rjmped over initially....
    "rcall .-4");         // 2 cycles - ... but then called here);
                          // wait 10 cycles in 4 words
  // the loop takes 1/3 of a microsecond (12 cycles) per iteration
  // so execute it three times for each microsecond of delay requested.
  us = (us << 1) + us; // x3 us, = 5 cycles
  // we just burned 23 (25) cycles above, remove 2
  us -= 2; // 2 cycles

#elif F_CPU >= 32000000L
  // here, we only take half a us at the start
  __asm__ __volatile__ ("rjmp .+0");
                          // wait 2 cycles
  // in by the end of this section.
  // the loop takes 1/4 of a microsecond (8 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles
  // we only burned ~14 cycles above, subtraction takes another 2 - so we've lost half a us,
  // and only need to drop 2 rounds through the loop!
  us -= 2; // = 2 cycles

#elif F_CPU >= 30000000L
  // for a one-microsecond delay, burn 14 cycles and return
  __asm__ __volatile__ (
    "rjmp .+2" "\n\t"     // 2 cycles - jump over the return.
    "ret" "\n\t"          // 4 cycles - rjmped over initially...
    "rcall .-4" "\n\t"    // 2 cycles - ... but then called here...
    "rcall .-6");         // 2+4 cycles - ... and here again!
                          // Waiting 14 cycles in only 4 words
  if (us <= 1) return; // = 3 cycles, (4 when true)
  // the loop takes 1/3 of a microsecond (10 cycles) per iteration
  // so execute it three times for each microsecond of delay requested.
  us = (us << 1) + us; // x3 us, = 5 cycles
  // we just burned 28 (30) cycles above, remove 3
  us -= 3; // 2 cycles

#elif F_CPU >= 28000000L
  // for a one-microsecond delay, burn 12 cycles and return
  __asm__ __volatile__ (
    "rjmp .+0" "\n\t"     // 2 cycles
    "rjmp .+0" "\n\t"     // 2 cycles
    "rjmp .+2" "\n\t"     // 2 cycles - jump over next instruction.
    "ret" "\n\t"          // 4 cycles - rjmped over initially....
    "rcall .-4");         // 2 cycles - ... but then called here);
                          // wait 12 cycles in 5 words
  if (us <= 1) return; //  = 3 cycles, (4 when true)

  // the loop takes 1/4 of a microsecond (7 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles=
  // we just burned 27 (29) cycles above, remove 4, (7*4=28)
  // us is at least 8 so we can subtract 5
  us -= 4; // = 2 cycles,

#elif F_CPU >= 27000000L
  // for a one-microsecond delay, burn 11 cycles and return
  __asm__ __volatile__ (  // wait 8 cycles with 3 words
    "rjmp .+2" "\n\t"     // 2 cycles - jump over next instruction.
    "ret" "\n\t"          // 4 cycles - rjmped over initially....
    "rcall .-4" "\n\t"    // 2 cycles - ... but then called here);
    "rjmp .+0" "\n\t"     // 2 cycles
    "nop");               // 1 more  == 11 total
  if (us <= 1) return;    //  = 3 cycles, (4 when true)

  // the loop takes 1/3 of a microsecond (8 cycles) per iteration
  // so execute it three times for each microsecond of delay requested.
  us = (us << 1) + us; // x3 us, = 5 cycles
  // we just burned 27 (24) cycles above, remove 3
  us -= 3; // 2 cycles


#elif F_CPU >= 24000000L
  // for a one-microsecond delay, burn 8 cycles and return
  __asm__ __volatile__ (
    "rjmp .+2" "\n\t"     // 2 cycles - jump over next instruction.
    "ret" "\n\t"          // 4 cycles - rjmped over initially....
    "rcall .-4");         // 2 cycles - ... but then called here);
                          // wait 8 cycles with 3 words
  if (us <= 1) return; //  = 3 cycles, (4 when true)

  // the loop takes 1/3 of a microsecond (8 cycles) per iteration
  // so execute it three times for each microsecond of delay requested.
  us = (us << 1) + us; // x3 us, = 5 cycles
  // we just burned 24 (22) cycles above, remove 3
  us -= 3; // 2 cycles

#elif F_CPU >= 20000000L
  // for a one-microsecond delay, burn 4 clocks and then return
  __asm__ __volatile__ (
    "rjmp .+0" "\n\t"     // 2 cycles
    "nop" );              // 1 cycle
                          // wait 3 cycles with 2 words
  if (us <= 1) return; //  = 3 cycles, (4 when true)
  // the loop takes a 1/2 of a microsecond (10 cycles) per iteration
  // so execute it twice for each microsecond of delay requested.
  us = us << 1; // x2 us, = 2 cycles
  // we just burned 21 (23) cycles above, remove 2
  // us is at least 4 so we can subtract 2.
  us -= 2; // 2 cycles

#elif F_CPU >= 16000000L
  // for a one-microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 1us
  if (us <= 1) return; //  = 3 cycles, (4 when true)
  // the loop takes 1/4 of a microsecond (4 cycles) per iteration
  // so execute it four times for each microsecond of delay requested.
  us <<= 2; // x4 us, = 4 cycles
  // we just burned 19 (21) cycles above, remove 5, (5*4=20)
  // us is at least 8 so we can subtract 5
  us -= 5; // = 2 cycles

#elif F_CPU >= 12000000L
  // for a 1 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 1.5us
  if (us <= 1) return; //  = 3 cycles, (4 when true)
  // the loop takes 1/3 of a microsecond (4 cycles) per iteration
  // so execute it three times for each microsecond of delay requested.
  us = (us << 1) + us; // x3 us, = 5 cycles
  // we just burned 20 (22) cycles above, remove 5, (5*4=20)
  // us is at least 6 so we can subtract 5
  us -= 5; // 2 cycles

#elif F_CPU >= 10000000L
  // for a 1 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 1.5us
  if (us <= 2) return; //  = 3 cycles, (4 when true)
  // the loop takes 1/2 of a microsecond (5 cycles) per iteration
  // so execute it 2 times for each microsecond of delay requested.
  us = us << 1; // x2 us, = 2 cycles
  // we just burned 20 (22) cycles above, remove 4, (5*4=20)
  // us is at least 6 so we can subtract 4
  us -= 4; // 2 cycles

#elif F_CPU >= 8000000L
  // for a 1 and 2 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 2us
  if (us <= 2) return; //  = 3 cycles, (4 when true)
  // the loop takes 1/2 of a microsecond (4 cycles) per iteration
  // so execute it twice for each microsecond of delay requested.
  us <<= 1; // x2 us, = 2 cycles
  // we just burned 17 (19) cycles above, remove 5, (4*5=20)
  // us is at least 6 so we can subtract 4
  us -= 5; // = 2 cycles

#elif F_CPU >= 5000000L
  // for a 1 ~ 3 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 3us
  if (us <= 3) return; // 3 cycles, (4 when true)
  // the loop takes 1 microsecond (5 cycles) per iteration
  // so just remove 3 loops for overhead
  us -= 3; // = 2 cycles

#elif F_CPU >= 4000000L
  // for a 1 ~ 4 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 4us
  if (us <= 4) return; // 3 cycles, (4 when true)
  // the loop takes 1 microsecond (4 cycles) per iteration,
  // just remove 4 loops for overhead
  us -= 4; // = 2 cycles for the time taken up with call overhead and test above

#elif F_CPU >= 2000000L
  // for a 1 ~ 4 microsecond delay, simply return.  the overhead
  // of the function call takes 14 (16) cycles, which is 8us
  if (us <= 8) return; // 3 cycles, (4 when true)
  // the loop takes 2 microsecond (4 cycles) per iteration,
  // just remove 4 loops for overhead
  us >>= 1; // divide by 2.
  us -= 4; // = 2 cycles for the time taken up with call overhead and test above

#else // F_CPU >= 1000000
  // for the 1 MHz internal clock (default settings for common AVR microcontrollers)
  // the overhead of the function calls is 14 (16) cycles
  if (us <= 16) return; // 3 cycles, (4 when true)
  if (us <= 25) return; // 3 cycles, (4 when true), (must be at least 26 if we want to subtract 22 and rightshift twice.)
  // compensate for the time taken by the preceding and following commands (about 22 cycles)
  us -= 22; // = 2 cycles
  // the loop takes 4 microseconds (4 cycles)
  // per iteration, so execute it us/4 times
  // us is at least 4, divided by 4 gives us 1 (no zero delay bug)
  us >>= 2; // us div 4, = 4 cycles
#endif
/* Implementation of the delay loop of 4, 5, 7, 8, 10, 11, or 12 clocks. */
#if defined(DELAYMICROS_TWELVE)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "rjmp .+2"      "\n\t"            // 2 cycles - jump over next instruction.
    "ret"           "\n\t"            // 4 cycles - rjmped over initially....
    "rcall .-4"     "\n\t"            // 2 cycles - ... but then called here
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_ELEVEN)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "nop"           "\n\t"            // 1 cycle
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_TEN)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_NINE)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "nop"           "\n\t"
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_EIGHT)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_SEVEN)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "rjmp .+0"      "\n\t"            // 2 cycles
    "nop"           "\n\t"            // 1 cycle
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#elif defined(DELAYMICROS_FIVE)
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "nop"           "\n\t"            // 1 cycle
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#else // the classic 4 cycle delay loop...
  __asm__ __volatile__ (
    "1: sbiw %0, 1" "\n\t"            // 2 cycles
    "brne 1b" : "=w" (us) : "0" (us)  // 2 cycles
  );
#endif
  // return = 4 cycles
}

/* stop_millis() - stop millis timekeeping by masking the interrupt of the
 * timer selected for timekeeping. Takes no arguments and returns nothing.
 *
 * Only the interrupt enable bit that init_millis() sets is cleared; the timer
 * itself is left running, except for the RTC, where bit 0 of RTC.CTRLA is
 * cleared as well.
 * When millis is disabled (MILLIS_USE_TIMERNONE) calling this is reported
 * through badCall().
 */
void stop_millis()
{ // Disable the interrupt:
  #if defined(MILLIS_USE_TIMERNONE)
    badCall("stop_millis() is only valid with millis time keeping enabled.");
  #else
    #if defined(MILLIS_USE_TIMERA0)
      #if !defined(TCA_BUFFERED_3PIN)
        // Split mode: init_millis() enabled the high-byte underflow interrupt.
        TCA0.SPLIT.INTCTRL &= (~TCA_SPLIT_HUNF_bm);
      #else
        // Single (buffered 3-pin) mode: init_millis() enabled the overflow
        // interrupt instead, so that is the bit which has to be cleared here.
        TCA0.SINGLE.INTCTRL &= (~TCA_SINGLE_OVF_bm);
      #endif
    #elif defined(MILLIS_USE_TIMERA1)
      TCA1.SPLIT.INTCTRL &= (~TCA_SPLIT_HUNF_bm);
    #elif defined(MILLIS_USE_TIMERD0)
      TCD0.INTCTRL &= 0xFE; // clear bit 0, the bit init_millis() sets to enable the interrupt
    #elif defined(MILLIS_USE_TIMERRTC)
      RTC.INTCTRL &= 0xFE;  // clear bit 0 (the overflow interrupt enabled by init_millis())
      RTC.CTRLA &= 0xFE;    // and clear bit 0 of CTRLA as well
    #else
      // Any other configuration is a type B timer, accessed through _timer.
      _timer->INTCTRL &= ~TCB_CAPT_bm;
    #endif
  #endif
}


/* restart_millis() - put the millis timer back into the state timekeeping
 * expects and then re-run init_millis(). Takes no arguments, returns nothing.
 *
 * Call this to restart millis after it has been stopped and/or after other
 * code has reconfigured the millis timer. Key registers are reset to their
 * expected values before init_millis() re-enables the interrupt.
 * When millis is disabled (MILLIS_USE_TIMERNONE) calling this is reported
 * through badCall().
 */
void restart_millis()
{
  #if defined(MILLIS_USE_TIMERNONE)
    badCall("restart_millis() is only valid with millis time keeping enabled.");
  #else
    #if defined(MILLIS_USE_TIMERA0)
      /* The type A timers need to be restored to the state they were in at startup.
       * NOTE(review): this always restores split mode, while init_millis() and init_TCA0()
       * have a TCA_BUFFERED_3PIN variant that runs TCA0 in single mode - confirm whether a
       * matching variant is needed here. */
      TCA0.SPLIT.CTRLA    = 0;                    // timer off (might need that for next steps)
      TCA0.SPLIT.CTRLD    = TCA_SPLIT_SPLITM_bm;  // because this will not work if it's enabled.
      TCA0.SPLIT.HPER     = PWM_TIMER_PERIOD;     // What was left behind
      #if (F_CPU > 25000000)  // use 256 divider when clocked over 25 MHz
        TCA0.SPLIT.CTRLA   = (TCA_SPLIT_CLKSEL_DIV256_gc) | (TCA_SPLIT_ENABLE_bm);
      #elif (F_CPU > 5000000) //  use 64 divider for everything in the middle
        TCA0.SPLIT.CTRLA   =  (TCA_SPLIT_CLKSEL_DIV64_gc) | (TCA_SPLIT_ENABLE_bm);
      #elif (F_CPU > 1000000) // and use 16...
        TCA0.SPLIT.CTRLA   =  (TCA_SPLIT_CLKSEL_DIV16_gc) | (TCA_SPLIT_ENABLE_bm);
      #else                   // or even 8 otherwise for really slow system clocks.
        TCA0.SPLIT.CTRLA   =   (TCA_SPLIT_CLKSEL_DIV8_gc) | (TCA_SPLIT_ENABLE_bm);
      #endif
      /* No TCA1 on tinyAVRs */
    #elif defined(MILLIS_USE_TIMERD0)
      TCD0.CTRLA          = 0x00;                 // disable TCD0 so init_millis() can reconfigure it
      // ENRDY (STATUS bit 0) reads 1 once the enable/disable has been synchronized and
      // CTRLA.ENABLE may be written again. Wait *until* it is set; the previous test was
      // inverted and let init_millis() try to re-enable the timer while it was not ready.
      while (!(TCD0.STATUS & TCD_ENRDY_bm));
    #elif (defined(MILLIS_USE_TIMERB0) || defined(MILLIS_USE_TIMERB1) || defined(MILLIS_USE_TIMERB2) || defined(MILLIS_USE_TIMERB3) || defined(MILLIS_USE_TIMERB4)) // It's a type b timer
      _timer->CTRLB       = 0;                    // clear the timer mode; init_millis() does the rest
    #endif
    init_millis();
  #endif
}


/* init_millis() - configure the timer selected by the MILLIS_USE_TIMERxx
 * macros for timekeeping and enable its interrupt. Takes no arguments and
 * returns nothing. Declared weak, so a strong definition elsewhere replaces it.
 *
 * Exactly one branch is compiled in:
 *   TIMERNONE - reports misuse through badCall().
 *   TIMERA0   - only enables the interrupt (HUNF in split mode, OVF when
 *               TCA_BUFFERED_3PIN is defined); nothing else is written here.
 *   TIMERA1   - ORs the HUNF interrupt enable into the existing INTCTRL value.
 *   TIMERD0   - sets the period, mode, interrupt and finally the clock/enable.
 *   TIMERRTC  - sets the period, clock source, interrupt and enables the RTC.
 *   otherwise - a type B timer accessed through _timer.
 */
void __attribute__((weak)) init_millis()
{
  #if defined(MILLIS_USE_TIMERNONE)
    badCall("init_millis() is only valid with millis time keeping enabled.");
  #else
    #if defined(MILLIS_USE_TIMERA0)
      #if !defined(TCA_BUFFERED_3PIN)
        // Split mode: timekeeping runs off the high-byte underflow interrupt.
        TCA0.SPLIT.INTCTRL = TCA_SPLIT_HUNF_bm;
      #else
        // Single mode: timekeeping runs off the overflow interrupt.
        TCA0.SINGLE.INTCTRL = TCA_SINGLE_OVF_bm;
      #endif
    #elif defined(MILLIS_USE_TIMERA1)
      // Unlike the TCA0 case this preserves any other interrupt enables already set.
      TCA1.SPLIT.INTCTRL |= TCA_SPLIT_HUNF_bm;
    #elif defined(MILLIS_USE_TIMERD0)
      TCD_t* pTCD;
      _fastPtr_d(pTCD, &TCD0);
      pTCD->CMPBCLR        = TIME_TRACKING_TIMER_PERIOD; // essentially, this is TOP
      pTCD->CTRLB          = 0x00; // oneramp mode
      pTCD->CTRLC          = 0x80;
      pTCD->INTCTRL        = 0x01; // enable interrupt
      // CTRLA is written last: it is the write that starts the timer.
      pTCD->CTRLA          = TIMERD0_PRESCALER | 0x01; // set clock source and enable!
    #elif defined(MILLIS_USE_TIMERRTC)
      while(RTC.STATUS); // if RTC is currently busy, spin until it's not.
      // to do: add support for RTC timer initialization
      RTC.PER             = 0xFFFF;
      #ifdef MILLIS_USE_TIMERRTC_XTAL
        _PROTECTED_WRITE(CLKCTRL.XOSC32KCTRLA,0x03);
        RTC.CLKSEL        = 2; // external crystal
      #else
        _PROTECTED_WRITE(CLKCTRL.OSC32KCTRLA,0x02);
        // RTC.CLKSEL=0; this is the power on value
      #endif
      RTC.INTCTRL         = 0x01; // enable overflow interrupt
      RTC.CTRLA           = (RTC_RUNSTDBY_bm|RTC_RTCEN_bm|RTC_PRESCALER_DIV32_gc);//fire it up, prescale by 32.
    #else // It's a type b timer - we have already errored out if that wasn't defined
      _timer->CCMP = TIME_TRACKING_TIMER_PERIOD;
      // Enable timer interrupt, but clear the rest of register
      _timer->INTCTRL = TCB_CAPT_bm;
      // Clear timer mode (since it will have been set as PWM by init())
      _timer->CTRLB = 0;
      // CLK_PER/1 is 0b00, . CLK_PER/2 is 0b01, so bitwise OR of valid divider with enable works
      _timer->CTRLA = TIME_TRACKING_TIMER_DIVIDER|TCB_ENABLE_bm;  // Keep this last before enabling interrupts to ensure tracking as accurate as possible
    #endif
  #endif
}

/* set_millis() - overwrite the millisecond counter.
 *
 *   newmillis  value to store in timingStruct.timer_millis.
 *
 * The counter is 32 bits wide and is also touched from the timekeeping
 * interrupt, so the store is done with interrupts disabled on every timer
 * (as nudge_millis() does); the previous interrupt state is restored
 * afterwards. With the RTC as millis timer the RTC count is zeroed as well.
 *
 * For the other timers micros() is NOT adjusted: millis and micros will be
 * out of sync after calling this. (Adjusting micros via the overflow count
 * was ugly and buggy. A possible future approach is an inline implementation,
 * so that for a constant newmillis the number of overflows can be computed
 * at compile time, and otherwise documenting that micros is zeroed.)
 *
 * When millis is disabled (MILLIS_USE_TIMERNONE) calling this is reported
 * through badCall().
 */
void set_millis(__attribute__((unused))uint32_t newmillis)
{
  #if defined(MILLIS_USE_TIMERNONE)
    badCall("set_millis() is only valid with millis timekeeping enabled.");
    (void)newmillis; // unused parameter
  #else
    uint8_t oldSREG = SREG; // save SREG (includes the global interrupt flag)
    cli();                  // interrupts off: the 4-byte store below must not be interrupted
    timingStruct.timer_millis = newmillis;
    #if defined(MILLIS_USE_TIMERRTC)
      while(RTC.STATUS&RTC_CNTBUSY_bm); // wait if RTC busy
      RTC.CNT = 0;
    #endif
    SREG = oldSREG;         // re-enable interrupts if we killed them
  #endif
}

/* nudge_millis() - advance the millisecond counter by nudgesize.
 *
 *   nudgesize  amount added to timingStruct.timer_millis.
 *
 * The addition is performed with interrupts disabled and the previous
 * interrupt state is restored afterwards. With MILLIS_USE_TIMERNONE this
 * function does nothing.
 */
void nudge_millis(__attribute__((unused)) uint16_t nudgesize) {
  #if defined(MILLIS_USE_TIMERNONE)
    (void)nudgesize; // unused parameter
  #else
    const uint8_t saved_sreg = SREG;        // remember the interrupt state
    cli();
    timingStruct.timer_millis += nudgesize;
    SREG = saved_sreg;                      // put the interrupt state back
  #endif
}

/* init() - bring up the hardware: clock first, then the ADC, the PWM timers
 * and (unless millis is disabled) the millis timer.
 *
 * Global interrupts are enabled last, and only when the onAfterInit() hook
 * returns non-zero.
 */
void init() {
  init_clock();
  init_ADC0();
  init_timers();
  #if !defined(MILLIS_USE_TIMERNONE)
    init_millis();
  #endif
  /*************************** ENABLE GLOBAL INTERRUPTS *************************/
  if (!onAfterInit()) {
    return; // the hook asked us to leave interrupts disabled
  }
  sei();
}

  /******************************** CLOCK STUFF *********************************/
#if defined(CLOCK_TUNE_INTERNAL)
  // Forward declaration: init_clock() calls tune_internal() before its
  // definition, which appears further down in this file.
  void tune_internal(void);
#endif

/* init_clock() - configure the main clock according to CLOCK_SOURCE and F_CPU.
 * Takes no arguments, returns nothing; weak so it can be overridden.
 *
 * CLOCK_SOURCE == 0 (internal oscillator):
 *   with CLOCK_TUNE_INTERNAL, everything is delegated to tune_internal();
 *   otherwise the main clock prescaler is chosen from F_CPU, which must be
 *   one of 20, 16, 10, 8, 5, 4, 2 or 1 MHz (anything else is a compile error).
 * CLOCK_SOURCE == 2 (external clock):
 *   switch to the external clock, poll the switch-over status a bounded number
 *   of times, and disable the prescaler only if the external clock is reported.
 */
void __attribute__((weak)) init_clock() {
  #ifndef CLOCK_SOURCE
    #error "CLOCK_SOURCE not defined. CLOCK_SOURCE must be either 0 (internal) or 2 (external clock)"
  #endif
  #if (CLOCK_SOURCE == 0)
    #if (defined(CLOCK_TUNE_INTERNAL))
      tune_internal(); // Will be inlined as only called once. Just too long and ugly to put two implementations in middle of this.
    #else
      #if (F_CPU == 20000000) || (F_CPU == 16000000)
        /* Oscillator used directly: no division on clock */
        _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, 0x00);
      #elif (F_CPU == 10000000) || (F_CPU == 8000000)
        /* 20 MHz or 16 MHz prescaled by 2 */
        _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, (CLKCTRL_PEN_bm | CLKCTRL_PDIV_2X_gc));
      #elif (F_CPU == 5000000) || (F_CPU == 4000000)
        /* 20 MHz or 16 MHz prescaled by 4 */
        _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, (CLKCTRL_PEN_bm | CLKCTRL_PDIV_4X_gc));
      #elif (F_CPU == 2000000)
        /* 16 MHz prescaled by 8 */
        _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, (CLKCTRL_PEN_bm | CLKCTRL_PDIV_8X_gc));
      #elif (F_CPU == 1000000)
        /* 16 MHz prescaled by 16 */
        _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, (CLKCTRL_PEN_bm | CLKCTRL_PDIV_16X_gc));
      #else
        #ifndef F_CPU
          #error "F_CPU not defined"
        #else
          #error "F_CPU defined as an unsupported value for untuned internal oscillator"
        #endif
      #endif
    #endif
  #elif (CLOCK_SOURCE == 2)
    _PROTECTED_WRITE(CLKCTRL_MCLKCTRLA, CLKCTRL_CLKSEL_EXTCLK_gc);
    // Waiting on SOSC without a bound either works or hangs the chip, and EXTS is
    // pretty much useless for that. Without clock failure detection it is easier
    // to work out what happened if we don't just hang here, so poll a limited
    // number of times only.
    uint8_t polls_left = 10;
    while ((CLKCTRL.MCLKSTATUS & CLKCTRL_SOSC_bm) && polls_left--) {
      ; // still switching over
    }
    if (CLKCTRL.MCLKSTATUS & CLKCTRL_EXTS_bm) {
      // External clock reported: run from it undivided.
      _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, 0x00);
    }
  #else
    #error "CLOCK_SOURCE is defined, but it isn't 0 (internal) or 2 (external clock), and those are the only clock sources supported by this part."
  #endif
}


#if defined(CLOCK_TUNE_INTERNAL)
  /* tune_internal() - apply a calibration value to the internal oscillator so
   * that it runs at (approximately) F_CPU, then set the main clock prescaler.
   * Takes no arguments and returns nothing.
   *
   * A stored tuning value is read from the address computed below; when none
   * is stored (the "home" entry reads 255) a guess from tune_guesses.h is used,
   * clamped to the valid range if it is no more than 5 steps outside it.
   *
   * Status is reported through GPIOR0:
   *   0x40 is set when no stored tuning value was found,
   *   0x80 is set when the requested speed is given up on.
   * On the give-up paths the function returns without touching the calibration
   * register or the prescaler.
   */
  void tune_internal() {
  #define _CLOCKSPERUS (F_CPU/1000000)
  uint8_t _osccfg; // magic name - do not change (the tune_guesses.h macros refer to it)
  #if defined(USING_BOOTLOADER) && USING_BOOTLOADER == 1
      // If using Optiboot, then we do not know what value OSCCFG was set to when it was bootloaded, so we have to determine it at runtime.
      // This is an assignment to the variable declared above - declaring it a second time in the same scope does not compile.
      _osccfg = FUSE.OSCCFG - 1; /****** "_osccfg" IS A MAGIC NAME - DO NOT CHANGE IT ******/
  #else
    // Otherwise the oscillator setting is known at compile time from the requested speed.
    #if MEGATINYCORE_SERIES == 2 && (_CLOCKSPERUS > 20 || _CLOCKSPERUS== 12 || _CLOCKSPERUS == 10 || _CLOCKSPERUS == 6 || _CLOCKSPERUS == 5 || _CLOCKSPERUS == 3)
      _osccfg = 1;
    #elif MEGATINYCORE_SERIES < 2 && (_CLOCKSPERUS > 20 || _CLOCKSPERUS== 12 || _CLOCKSPERUS == 10 || _CLOCKSPERUS == 7 ||  _CLOCKSPERUS == 6 || _CLOCKSPERUS == 5 || _CLOCKSPERUS == 3)
      _osccfg = 1;
    #else
      _osccfg = 0;
    #endif
  #endif
  #include "tune_guesses.h"
    // The GUESSCAL, MAX_TUNING, TUNED_CALIBRATION_OFFSET and TUNE_PRESCALE symbols, which look like constants, aren't.
    // They're macros from tune_guesses.h and get replaced with (ternary operators and math involving osccfg), so what looks very simple here... actually isn't.
    // Everything hard is done in tune_guesses.h
    if (__builtin_constant_p(TUNED_CALIBRATION_OFFSET)) {
      // Known at compile time: turn an impossible request into a build error.
      if (TUNED_CALIBRATION_OFFSET == 255) {
        badCall("It appears that you are attempting to set a 0/1-series part to 32 MHz via tuning or otherwise set a bogus clock speed.");
      }
    }
    if (TUNED_CALIBRATION_OFFSET == 255) {

      GPIOR0 |= 0x80;
      GPIOR0 |= 0x40;
      return; // we can't do that speed at all with this part and oscillator setting! Hopefully users notice their sketch is running
      // way too slow, and will read the docs which contain further instructions for diagnosis of these sort of problems.
    } else {
      uint8_t istuned =(_SFR_MEM8((_osccfg ? 0x1306 : 0x1300) +  CLOCK_TUNE_START + HOME_TUNING_OFFSET)) != 255;
      uint8_t tunedval=_SFR_MEM8(((_osccfg ? 0x1306 : 0x1300) +  CLOCK_TUNE_START + TUNED_CALIBRATION_OFFSET));
      if (!istuned) {
        GPIOR0 |= 0x40;
        int temp = GUESSCAL;
        if (temp > MAX_TUNING) {             // uhoh, if we apply the default guess, we'd be setting it higher than its maximum value!
          // How far past the maximum are we? temp is the larger value here, so the overshoot is temp - MAX_TUNING
          // (the reversed subtraction is negative and made this check unreachable).
          if (temp - MAX_TUNING > 5) {
            GPIOR0 |= 0x80;
            return; // If it's more than 5, give up - better to be obviously broken than non-obviously broken
          }
          tunedval = MAX_TUNING;
        } else if (temp < 0) {               // uhoh, if we apply the default guess, we'd be setting it to a negative value (which would wrap around, resulting in the value being too high..
          if (temp < -5) return;             // How far away are we? If it's more than 5, give up - better to be obviously broken than non-obviously broken
          tunedval = 0;
        } else {
          tunedval = temp;
        }
      } else if (tunedval == 0x80) {
        GPIOR0 |= 0x80;
        return; // this chip was tuned and its oscillator found to be unable to reach target and/or the chip ceased to operate before reaching that speed
        // such that either the tuning sketch crashed or the incredibly crude sanity checks found that arithmetic produced incorrect results.
      }
      // Apply the calibration value, then give the oscillator two cycles before the prescaler is changed.
      _PROTECTED_WRITE(CLKCTRL_OSC20MCALIBA,tunedval);
      _NOP();
      _NOP();
    }
    _PROTECTED_WRITE(CLKCTRL_MCLKCTRLB, TUNE_PRESCALE);
  }
#endif


/********************************* ADC ****************************************/
/* init_ADC0() - set the ADC0 clock prescaler and sampling time from F_CPU and
 * enable the ADC. Takes no arguments, returns nothing; weak so it can be
 * overridden. The 0/1-series and the 2-series have different ADCs and are
 * handled by separate branches selected on MEGATINYCORE_SERIES.
 */
void __attribute__((weak)) init_ADC0() {
  ADC_t* pADC;
  _fastPtr_d(pADC, &ADC0);
  #if MEGATINYCORE_SERIES < 2
  /* ADC clock 1 MHz to 1.25 MHz at frequencies supported by megaTinyCore
   * Unlike the classic AVRs, which demand 50~200 kHz, for these, the datasheet
   * spec's 50 kHz to 1.5 MHz. Slower clocks provide better response to high
   * impedance signals, since the sample and hold circuit will be connected
   * to the pin for longer. However, we can use the SAMPLEN register to
   * compensate for this!
   * We target a sampling time of 12us, which is a little shorter than the
   * classic AVRs, but the sampling cap is 5pf instead of 14
   * At clock speeds of 12, 24, and 25 MHz when we are forced to use a divider
   * that leaves us with a markedly slower ADC clock (~750 kHz), we instead use
   * a value of 7, giving us 9 ADC clocks or... around 12us.
   * As of 2.3.0, this setting is exposed by analogReadDuration()
   * Note that on 0/1-series, the prescale settings are placed powers-of-two
   * apart. On the 2-series and Dx-series, they are MUCH closer together.
   **************************************************************************/
    //                              30 MHz / 32 = 937 kHz,  32 MHz / 32 =  1 MHz.
    #if   F_CPU   > 24000000     // 24 MHz / 16 = 1.5 MHz,  25 MHz / 32 =  780 kHz
      pADC->CTRLC  = ADC_PRESC_DIV32_gc | ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;
    #elif F_CPU  >= 12000000    // 16 MHz / 16 = 1.0 MHz,  20 MHz / 16 = 1.25 MHz
      pADC->CTRLC  = ADC_PRESC_DIV16_gc | ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;
    #elif F_CPU  >=  6000000    //  8 MHz /  8 = 1.0 MHz,  10 MHz /  8 = 1.25 MHz
      pADC->CTRLC  =  ADC_PRESC_DIV8_gc | ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;
    #elif F_CPU  >=  3000000    //  4 MHz /  4 = 1.0 MHz,   5 MHz /  4 = 1.25 MHz
      pADC->CTRLC  =  ADC_PRESC_DIV4_gc | ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;
    #else                       //  1 MHz /  2 = 500 kHz - the lowest setting
      pADC->CTRLC  =  ADC_PRESC_DIV2_gc | ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;
    #endif
    #if   (F_CPU == 6000000 || F_CPU == 12000000 || F_CPU == 24000000 || F_CPU ==25000000)
      pADC->SAMPCTRL = (7); // 9 ADC clocks, 12 us
    #elif (F_CPU == 5000000 || F_CPU == 10000000 || F_CPU == 20000000)
      pADC->SAMPCTRL = (13);   // 15 ADC clocks, 12 us
    #else
      pADC->SAMPCTRL = (10); // 12 ADC clocks, 12 us
    #endif
    pADC->CTRLD    = ADC_INITDLY_DLY16_gc;
    pADC->CTRLA    = ADC_ENABLE_bm;
  #else
    /* On the 2-series maximum with internal reference is 3 MHz, so we will
     * target highest speed that doesn't exceed that and 16 ADC clocks sample
     * duration. */
    // Above 32 MHz the divider has to be 14: dividing 33~36 MHz by 10 would give
    // 3.3~3.6 MHz, which is over the 3 MHz limit stated above.
    #if F_CPU     > 32000000            // 36 MHz /14 = 2.57 MHz
      pADC->CTRLB  = ADC_PRESC_DIV14_gc; // 33 MHz /14 = 2.35 MHz
    #elif F_CPU  >= 30000000            // 32 MHz /12 = 2.67 MHz
      pADC->CTRLB  = ADC_PRESC_DIV12_gc; // 30 MHz /12 = 2.50 MHz
    #elif F_CPU  >= 24000000            // 25 MHz /10 = 2.50 MHz
      pADC->CTRLB  = ADC_PRESC_DIV10_gc; // 24 MHz /10 = 2.40 MHz
    #elif F_CPU  >= 20000000
      pADC->CTRLB  = ADC_PRESC_DIV8_gc;  // 20 MHz / 8 = 2.50 MHz
    #elif F_CPU  >= 16000000
      pADC->CTRLB  = ADC_PRESC_DIV6_gc;  // 16 MHz / 6 = 2.67 MHz
    #elif F_CPU  >= 12000000
      pADC->CTRLB  = ADC_PRESC_DIV4_gc;  // 12 MHz / 4 = 3.00 MHz
    #elif F_CPU  >=  6000000            // 10 MHz / 4 = 2.50 MHz
      pADC->CTRLB  = ADC_PRESC_DIV4_gc;  //  8 MHz / 4 = 2.00 MHz
    #else                               //  5 MHz / 2 = 2.50 MHz
      pADC->CTRLB  = ADC_PRESC_DIV2_gc;  //  4 MHz / 2 = 2.00 MHz
    #endif                              //  1 MHz / 2 =  500 kHz
    pADC->CTRLE = 15; // 15.5 without PGA, 16 with PGA, corresponding to 7.75 or 8 us.
    pADC->CTRLA = ADC_ENABLE_bm | ADC_LOWLAT_bm;
    /* Default low latency mode on
     * Users can turn it off if they care about power consumption while ADC is on
     * and chip is awake, since these parts don't have the perverse ADC-left-on
     * behavior of classic AVRs. */
    pADC->CTRLC = TIMEBASE_1US; // defined in Arduino.h.
    pADC->PGACTRL = ADC_PGABIASSEL_3_4X_gc | ADC_ADCPGASAMPDUR_15CLK_gc;
    /* Note that we don't *enable* it automatically in init().
     * 3/4th bias is good up to 4 MHz CLK_ADC, 15 ADC Clocks to sample the PGA
     * up to 5 MHz, so within the regime of speeds that have to be compatible
     * with internal references, we are in the clear there. */
  #endif
}

// Must be called manually.
#ifdef ADC1
  /* init_ADC1() - configure and enable the second ADC, on parts that have one.
   * Takes no arguments, returns nothing; weak so it can be overridden.
   *
   * The prescaler is picked from F_CPU, VDD is selected as reference with the
   * sample capacitance bit set, and the sample length is chosen for roughly
   * 12 us of sampling time.
   */
  __attribute__((weak)) void init_ADC1() {
    ADC_t* adc;
    _fastPtr_d(adc, &ADC1);

    // Reference and sample-capacitor bits are the same at every clock speed.
    const uint8_t ref_and_cap = ADC_REFSEL_VDDREF_gc | ADC_SAMPCAP_bm;

    /* Prescaler by system clock:
     *   > 24 MHz : /32  (25 MHz -> 780 kHz, 30 MHz -> 937 kHz, 32 MHz -> 1 MHz)
     *   >= 12 MHz: /16  (16 MHz -> 1.0 MHz, 20 MHz -> 1.25 MHz, 24 MHz -> 1.5 MHz)
     *   >=  6 MHz: /8   ( 8 MHz -> 1.0 MHz, 10 MHz -> 1.25 MHz)
     *   >=  3 MHz: /4   ( 4 MHz -> 1.0 MHz,  5 MHz -> 1.25 MHz)
     *   slower   : /2   ( 1 MHz -> 500 kHz, the lowest setting)
     */
    #if   F_CPU   > 24000000
      adc->CTRLC = ADC_PRESC_DIV32_gc | ref_and_cap;
    #elif F_CPU  >= 12000000
      adc->CTRLC = ADC_PRESC_DIV16_gc | ref_and_cap;
    #elif F_CPU  >=  6000000
      adc->CTRLC = ADC_PRESC_DIV8_gc | ref_and_cap;
    #elif F_CPU  >=  3000000
      adc->CTRLC = ADC_PRESC_DIV4_gc | ref_and_cap;
    #else
      adc->CTRLC = ADC_PRESC_DIV2_gc | ref_and_cap;
    #endif

    // Sample length, aiming at about 12 us whatever the ADC clock ended up as.
    #if   (F_CPU == 6000000 || F_CPU == 12000000 || F_CPU == 24000000 || F_CPU ==25000000)
      const uint8_t sample_len = 7;   // 9 ADC clocks
    #elif (F_CPU == 5000000 || F_CPU == 10000000 || F_CPU == 20000000)
      const uint8_t sample_len = 13;  // 15 ADC clocks
    #else
      const uint8_t sample_len = 10;  // 12 ADC clocks
    #endif
    adc->SAMPCTRL = sample_len;

    adc->CTRLD = ADC_INITDLY_DLY16_gc;
    adc->CTRLA = ADC_ENABLE_bm; // enable last, once everything else is set
  }
#endif

/* init_timers() - set up the PWM timers: TCA0 always, and TCD0 only when the
 * part has one, PWM on it is requested, and it is not the millis timer.
 */
void init_timers()
{
  init_TCA0();
#if defined(TCD0) && defined(USE_TIMERD0_PWM) && !defined(MILLIS_USE_TIMERD0)
  init_TCD0();
#endif
}


#if (defined(TCD0) && defined(USE_TIMERD0_PWM) && !defined(MILLIS_USE_TIMERD0))
/* init_TCD0() - prepare TCD0 for PWM use. Only compiled when TCD0 exists, PWM
 * on it is requested and it is not used for millis. Weak, so it can be
 * overridden. Note that the enable bit is not part of the value written to
 * CTRLA here.
 */
void __attribute__((weak)) init_TCD0() {
  TCD_t* tcd;
  _fastPtr_d(tcd, &TCD0);

  // 510 counts per cycle: counting starts at 0, not 1, hence 509.
  tcd->CMPBCLR = 509;
  tcd->CMPACLR = 509;

  // WOD outputs PWM B, WOC outputs PWM A
  tcd->CTRLC = 0x80;

  // One ramp (single slope) waveform generation
  tcd->CTRLB = TCD_WGMODE_ONERAMP_gc;

  // Clock selection; OSC20M prescaled by 32 gives ~1.2 kHz PWM at 20 MHz.
  tcd->CTRLA = TIMERD0_PRESCALER;
}
#endif

/* init_TCA0() - route and start the type A timer for PWM. Weak, so it can be
 * overridden.
 *
 * The output routing is written first (the register is named differently
 * depending on the part). The timer is then started either in single mode
 * with single-slope PWM (TCA_BUFFERED_3PIN) or, by default, in split mode
 * with both halves using PWM_TIMER_PERIOD.
 */
void __attribute__((weak)) init_TCA0() {
  /*  TYPE A TIMER   */
  // Pin routing is the same for both operating modes.
  #if defined(PORTMUX_CTRLC)
    PORTMUX.CTRLC = TCA_PORTMUX;
  #else
    PORTMUX.TCAROUTEA = TCA_PORTMUX;
  #endif

  #if defined(TCA_BUFFERED_3PIN)
    // Single mode, single-slope PWM
    TCA0.SINGLE.PER    = PWM_TIMER_PERIOD;
    TCA0.SINGLE.CTRLB  = TCA_SINGLE_WGMODE_SINGLESLOPE_gc;
    TCA0.SINGLE.CTRLA  = (TIMERA_PRESCALER_bm | TCA_SINGLE_ENABLE_bm);
  #else
    // Split mode: low and high halves get the same period
    TCA0.SPLIT.CTRLD   = TCA_SPLIT_SPLITM_bm;
    TCA0.SPLIT.LPER    = PWM_TIMER_PERIOD;
    TCA0.SPLIT.HPER    = PWM_TIMER_PERIOD;
    TCA0.SPLIT.CTRLA   = (TIMERA_PRESCALER_bm | TCA_SPLIT_ENABLE_bm);
  #endif
}


/* Default (weak) hook implementations; a strong definition elsewhere replaces them.
 * onPreMain() and onBeforeInit() do nothing by default; their call sites are not in
 * this part of the file.
 * onAfterInit() is called at the end of init(): a non-zero return value lets init()
 * enable global interrupts, and the default returns 1.
 */
__attribute__((weak)) void onPreMain() {
}

__attribute__((weak)) void onBeforeInit() {
}

__attribute__((weak)) uint8_t onAfterInit() {
  return 1;
}