- pop YH ;2 correct stack alignment
- nop2 ;2 delay for the same time as the pushes in the original code
- rjmp shortcutEntry ;2
-
-; ################# receiver loop #################
-; extra jobs done during bit interval:
-; bit 6: se0 check
-; bit 7: or, store, clear
-; bit 0: recover from delay [SE0 is unreliable here due to bit dribbling in hubs]
-; bit 1: se0 check
-; bit 2: se0 check
-; bit 3: overflow check
-; bit 4: se0 check
-; bit 5: rjmp
-
-; stuffed* helpers have the functionality of a subroutine, but we can't afford
-; the overhead of a call. We therefore need a separate routine for each caller
-; which jumps back appropriately.
-; stuffed5: taken from rxLoop when the cpi shift, 4 after bit 5 found shift < 4; resumes at rxbit6
-stuffed5: ;1 for branch taken
- in x2, USBIN ;1 <-- sample @ +1 (overwrites the bit 5 sample held in x2)
- andi x2, USBMASK ;1 keep only the USBMASK bits
- breq se0a ;1 nothing left after masking -> leave through se0a
- andi x3, 0xc0 ;1 (0xff03 >> 2) & 0xff -- clears bits 5..0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit6 ;2 continue with the bit 6 sample
-; stuffed6: taken from the end of rxbit6 when shift < 4; resumes at rxbit7
-stuffed6: ;1 for branch taken
- in x1, USBIN ;1 <-- sample @ +1 (overwrites the bit 6 sample held in x1)
- andi x1, USBMASK ;1 keep only the USBMASK bits
- breq se0a ;1 nothing left after masking -> leave through se0a
- andi x3, 0x81 ;1 (0xff03 >> 1) & 0xff -- clears bits 6..1 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit7 ;2 continue with the bit 7 sample
-; stuffed7: taken from rxbit0 when shift < 4 after bit 7; resumes at unstuffed7
-; This is somewhat special because it has to compensate for the delay in bit 7
-stuffed7: ;1 for branch taken
- andi x1, USBMASK ;1 already sampled by caller (the in at rxbit0)
- breq se0a ;1 nothing left after masking -> leave through se0a
- mov x2, x1 ;1 ensure correct NRZI sequence [we can save andi x3 here]; x2 is the operand eor-ed with x1 at unstuffed7
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- in x1, USBIN ;1 <-- sample bit 0 (taken here because the jump skips the in at rxbit0)
- rjmp unstuffed7 ;2 rejoin the loop after the sample/test of rxbit0
-; stuffed0: taken from the end of unstuffed7 when shift < 4; resumes at rxbit1
-stuffed0: ;1 for branch taken
- in x1, USBIN ;1 <-- sample @ +1 (overwrites the bit 0 sample held in x1)
- andi x1, USBMASK ;1 keep only the USBMASK bits
- breq se0a ;1 nothing left after masking -> leave through se0a
- andi x3, 0xfe ;1 (0xff03 >> 7) & 0xff -- clears bit 0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit1 ;2 continue with the bit 1 sample
-
-;-----------------------------
-rxLoop: ; in this view reached by rjmp from rxbit5, with the flags of its cpi shift, 4
- brlo stuffed5 ;1 shift < 4 after bit 5 -> detour through stuffed5
-rxbit6:
- in x1, USBIN ;1 <-- sample bit 6
- andi x1, USBMASK ;1 keep only the USBMASK bits
- breq se0a ;1 masked sample is zero -> se0a -> se0
- eor x2, x1 ;1 x2 ^= x1: bits that differ between the two most recent samples
- ror x2 ;1 bit 0 of x2 -> carry
- ror shift ;1 carry -> bit 7 of shift, older bits move towards bit 0
- cpi shift, 4 ;1 carry set if shift < 4
- brlo stuffed6 ;1 shift < 4 -> detour through stuffed6
-rxbit7:
- in x2, USBIN ;1 <-- sample bit 7 (no mask/SE0 test in this bit interval)
- eor x1, x2 ;1 same eor/ror/ror step as above with x1 and x2 swapped
- ror x1 ;1
- ror shift ;1
- eor x3, shift ;1 x3 is 0 at bit locations we changed, 1 at others
- st y+, x3 ;2 the eor above reconstructed modified bits and inverted rx data
- ser x3 ;1 x3 = 0xff again for the next byte
-rxbit0:
- in x1, USBIN ;1 <-- sample bit 0
- cpi shift, 4 ;1 test for bit 7, deferred until after the store above
- brlo stuffed7 ;1 shift < 4 -> stuffed7 (which re-samples and jumps to unstuffed7)
-unstuffed7:
- eor x2, x1 ;1
- ror x2 ;1
- ror shift ;1
- cpi shift, 4 ;1
- brlo stuffed0 ;1
-rxbit1:
- in x2, USBIN ;1 <-- sample bit 1
- andi x2, USBMASK ;1
-se0a: ; enlarge jump range to SE0 (target of the breq in stuffed5/6/7/0 and rxbit6)
- breq se0 ;1 check for SE0 more often close to start of byte
- eor x1, x2 ;1
- ror x1 ;1
- ror shift ;1
- cpi shift, 4 ;1
- brlo stuffed1 ;1
-rxbit2:
- in x1, USBIN ;1 <-- sample bit 2
- andi x1, USBMASK ;1
- breq se0 ;1
- eor x2, x1 ;1
- ror x2 ;1
- ror shift ;1
- cpi shift, 4 ;1
- brlo stuffed2 ;1
-rxbit3:
- in x2, USBIN ;1 <-- sample bit 3 (no mask/SE0 test in this bit interval)
- eor x1, x2 ;1
- ror x1 ;1
- ror shift ;1
- dec cnt ;1 check for buffer overflow
- breq overflow ;1 cnt reached zero -> overflow -> rxDoReturn
- cpi shift, 4 ;1
- brlo stuffed3 ;1
-rxbit4:
- in x1, USBIN ;1 <-- sample bit 4
- andi x1, USBMASK ;1
- breq se0 ;1
- eor x2, x1 ;1
- ror x2 ;1
- ror shift ;1
- cpi shift, 4 ;1
- brlo stuffed4 ;1
-rxbit5:
- in x2, USBIN ;1 <-- sample bit 5 (no mask/SE0 test in this bit interval)
- eor x1, x2 ;1
- ror x1 ;1
- ror shift ;1
- cpi shift, 4 ;1 result is consumed by the brlo at rxLoop
- rjmp rxLoop ;2 rjmp leaves the flags untouched
-;-----------------------------
-; stuffed1: taken from the end of rxbit1 when shift < 4; resumes at rxbit2
-stuffed1: ;1 for branch taken
- in x2, USBIN ;1 <-- sample @ +1 (overwrites the bit 1 sample held in x2)
- andi x2, USBMASK ;1 keep only the USBMASK bits
- breq se0 ;1 nothing left after masking -> se0
- andi x3, 0xfc ;1 (0xff03 >> 6) & 0xff -- clears bits 1..0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit2 ;2 continue with the bit 2 sample
-; stuffed2: taken from the end of rxbit2 when shift < 4; resumes at rxbit3
-stuffed2: ;1 for branch taken
- in x1, USBIN ;1 <-- sample @ +1 (overwrites the bit 2 sample held in x1)
- andi x1, USBMASK ;1 keep only the USBMASK bits
- breq se0 ;1 nothing left after masking -> se0
- andi x3, 0xf8 ;1 (0xff03 >> 5) & 0xff -- clears bits 2..0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit3 ;2 continue with the bit 3 sample
-; stuffed3: taken from the end of rxbit3 when shift < 4; resumes at rxbit4
-stuffed3: ;1 for branch taken
- in x2, USBIN ;1 <-- sample @ +1 (overwrites the bit 3 sample held in x2)
- andi x2, USBMASK ;1 keep only the USBMASK bits
- breq se0 ;1 nothing left after masking -> se0
- andi x3, 0xf0 ;1 (0xff03 >> 4) & 0xff -- clears bits 3..0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit4 ;2 continue with the bit 4 sample
-; stuffed4: taken from the end of rxbit4 when shift < 4; resumes at rxbit5
-stuffed4: ;1 for branch taken
- in x1, USBIN ;1 <-- sample @ +1 (overwrites the bit 4 sample held in x1)
- andi x1, USBMASK ;1 keep only the USBMASK bits
- breq se0 ;1 nothing left after masking -> se0
- andi x3, 0xe0 ;1 (0xff03 >> 3) & 0xff -- clears bits 4..0 of x3
- ori shift, 0xfc ;1 force bits 7..2 of shift to 1, so shift is >= 4 again
- rjmp rxbit5 ;2 continue with the bit 5 sample
-
-;################ end receiver loop ###############
-; overflow: taken from rxbit3 when dec cnt reaches zero
-overflow: ; ignore package if buffer overflow
- rjmp rxDoReturn ; enlarge jump range (breq only reaches nearby targets, rjmp reaches further)
-; se0: SE0 exit path -- derives the received length from YL, then dispatches on the first buffer byte (PID)
-;This is the only non-error exit point for the software receiver loop
-;{4, 20} cycles after start of SE0, typically {10, 18} after SE0 start = {-6, 2} from end of SE0
-;next sync starts {16,} cycles after SE0 -> worst case start: +4 from next sync start
-;we don't check any CRCs here because there is no time left.
-se0: ;{-6, 2} from end of SE0 / {,4} into next frame
- mov cnt, YL ;1 assume buffer in lower 256 bytes of memory
- lds YL, usbInputBuf ;2 reposition to buffer start
- sub cnt, YL ;1 length of message
- ldi x1, 1<<USB_INTR_PENDING_BIT ;1
- cpi cnt, 3 ;1 compare now; the out below leaves the flags alone for the brlo
- out USB_INTR_PENDING, x1;1 clear pending intr and check flag later. SE0 must be over. {,10} into next frame
- brlo rxDoReturn ;1 ensure valid packet size, ignore others
- ld x1, y ;2 PID
- ldd x2, y+1 ;2 ADDR + 1 bit endpoint number
- mov x3, x2 ;1 store for endpoint number
- andi x2, 0x7f ;1 mask endpoint number bit
- lds shift, usbDeviceId ;2 shift now holds our device ID
- cpi x1, USBPID_SETUP ;1
- breq isSetupOrOut ;2 -> 19 = {13, 21} from SE0 end
- cpi x1, USBPID_OUT ;1
- breq isSetupOrOut ;2 -> 22 = {16, 24} from SE0 end / {,24} into next frame
- cpi x1, USBPID_IN ;1
- breq handleIn ;1
- cpi x1, USBPID_DATA0 ;1
- breq isData ;1
- cpi x1, USBPID_DATA1 ;1
- brne rxDoReturn ;1 ignore all other PIDs
-isData:
- lds x2, usbCurrentTok ;2
- tst x2 ;1 usbCurrentTok == 0 ?
- breq rxDoReturn ;1 for other device or spontaneous data -- ignore
- lds x1, usbRxLen ;2
- cpi x1, 0 ;1 usbRxLen != 0 ?
- brne sendNakAndReti ;1 no buffer space available / {30, 38} from SE0 end
- sts usbRxLen, cnt ;2 store received data, swap buffers
- sts usbRxToken, x2 ;2 usbRxToken = usbCurrentTok
- lds x1, usbAppBuf ;2 x1 = old usbAppBuf
- sts usbAppBuf, YL ;2 usbAppBuf = start of the buffer just received (YL was reloaded from usbInputBuf)
- sts usbInputBuf, x1 ;2 buffers now swapped
- rjmp sendAckAndReti ;2 -> {42, 50} from SE0 end
-
-handleIn: ; {18, 26} from SE0 end
- cp x2, shift ;1 shift contains our device ID
- brne rxDoReturn ;1 other device
-#if USB_CFG_HAVE_INTRIN_ENDPOINT
- sbrc x3, 7 ;2
- rjmp handleIn1 ;0
+
+#if USB_USE_FAST_CRC
+
+; This implementation is faster, but has bigger code size
+; Thanks to Slawomir Fras (BoskiDialer) for this code and to Shay Green for
+; even further optimizations!
+; It implements the following C pseudo-code:
+; unsigned table(unsigned char x)
+; {
+; unsigned value;
+;
+; value = (unsigned)x << 6;
+; value ^= (unsigned)x << 7;
+; if(parity(x))
+; value ^= 0xc001;
+; return value;
+; }
+; unsigned usbCrc16(unsigned char *argPtr, unsigned char argLen)
+; {
+; unsigned crc = 0xffff;
+;
+; while(argLen--)
+; crc = table(lo8(crc) ^ *argPtr++) ^ hi8(crc);
+; return ~crc;
+; }
+
+; extern unsigned usbCrc16(unsigned char *argPtr, unsigned char argLen);
+; argPtr r24+25 / r16+r17
+; argLen r22 / r18
+; temp variables (bitCnt is additionally cleared and used as a constant zero):
+; byte r18 / r22
+; scratch r23
+; resCrc r24+r25 / r16+r17
+; ptr X / Z
+usbCrc16:
+ movw ptrL, argPtrL ; ptr = argPtr (movw copies the whole register pair)
+ ldi resCrcL, 0xFF ; crc = 0xffff
+ ldi resCrcH, 0xFF
+ clr bitCnt ; zero reg
+ rjmp usbCrc16LoopTest ; test the length first, so argLen == 0 skips the loop body
+usbCrc16ByteLoop:
+ ld byte, ptr+ ; byte = *ptr++
+ eor byte, resCrcL ; byte is now 'x' in table()
+ mov scratch, byte ; scratch keeps 'x'; byte is reduced to the parity of 'x' below
+ swap byte ; exchange the nibbles of byte
+ eor byte, scratch ; each nibble = high nibble of x ^ low nibble of x
+ mov resCrcL, byte ; resCrcL is only a temporary here
+ lsr byte
+ lsr byte
+ eor byte, resCrcL ; bits 1..0 now combine bits 3..2 with bits 1..0
+ inc byte ; adding 1 xors bit 0 into bit 1 via the carry out of bit 0
+ andi byte, 2 ; byte is now parity(x) << 1
+ cp bitCnt, byte ; c = (byte != 0), then put in high bit
+ ror scratch ; so that after xoring, shifting, and xoring, it gives
+ ror byte ; the desired 0xC0 with resCrcH
+ mov resCrcL, byte
+ eor resCrcL, resCrcH ; old high byte is folded into the new low byte
+ mov resCrcH, scratch
+ lsr scratch ; second shift of the 16 bit pair scratch:byte
+ ror byte
+ eor resCrcH, scratch
+ eor resCrcL, byte
+usbCrc16LoopTest:
+ subi argLen, 1 ; borrow (carry set) only when argLen was 0
+ brsh usbCrc16ByteLoop ; no borrow -> one more byte to process
+ com resCrcL ; return ~crc
+ com resCrcH
+ ret
+
+#else /* USB_USE_FAST_CRC */
+
+; This implementation is slower, but has less code size
+; It works bit by bit: 8 shift steps per data byte, with polynomial 0xa001 applied conditionally.
+; extern unsigned usbCrc16(unsigned char *argPtr, unsigned char argLen);
+; argPtr r24+25 / r16+r17
+; argLen r22 / r18
+; temp variables:
+; byte r18 / r22
+; bitCnt r19
+; poly r20+r21
+; scratch r23
+; resCrc r24+r25 / r16+r17
+; ptr X / Z
+usbCrc16:
+ mov ptrL, argPtrL ; ptr = argPtr
+ mov ptrH, argPtrH
+ ldi resCrcL, 0 ; NOTE(review): starts at 0 and returns without a final com; presumably the forced 1 carry and the brcs test below make up for that -- confirm
+ ldi resCrcH, 0
+ ldi polyL, lo8(0xa001)
+ ldi polyH, hi8(0xa001)
+ com argLen ; argLen = -argLen - 1: modified loop to ensure that carry is set
+ ldi bitCnt, 0 ; loop counter with start condition = end condition
+ rjmp usbCrcLoopEntry ; test the length first, so argLen == 0 skips the loop body
+usbCrcByteLoop:
+ ld byte, ptr+ ; byte = *ptr++
+ eor resCrcL, byte
+usbCrcBitLoop:
+ ror resCrcH ; carry is always set here (see brcs jumps to here)
+ ror resCrcL ; bit 0 of resCrcL -> carry
+ brcs usbCrcNoXor ; polynomial is applied only when the bit shifted out is 0
+ eor resCrcL, polyL
+ eor resCrcH, polyH
+usbCrcNoXor:
+ subi bitCnt, 224 ; (8 * 224) % 256 = 0; this loop iterates 8 times
+ brcs usbCrcBitLoop ; carry is set while bitCnt was below 224, i.e. for the first 7 passes
+usbCrcLoopEntry:
+ subi argLen, -1 ; argLen += 1; carry is set unless argLen was 0xff before
+ brcs usbCrcByteLoop ; runs once per byte of the original argLen
+usbCrcReady:
+ ret
+; Thanks to Reimar Doeffinger for optimizing this CRC routine!
+
+#endif /* USB_USE_FAST_CRC */
+
+; extern unsigned usbCrc16Append(unsigned char *data, unsigned char len);
+usbCrc16Append:
+ rcall usbCrc16 ; leaves the CRC in resCrcL/resCrcH and ptr advanced past the len bytes it read
+ st ptr+, resCrcL ; store the low byte directly behind the data
+ st ptr+, resCrcH ; then the high byte
+ ret ; the stores do not modify resCrcL/resCrcH
+; The preprocessor names used by the CRC routines above are removed from here on.
+#undef argLen
+#undef argPtrL
+#undef argPtrH
+#undef resCrcL
+#undef resCrcH
+#undef ptrL
+#undef ptrH
+#undef ptr
+#undef byte
+#undef bitCnt
+#undef polyL
+#undef polyH
+#undef scratch
+
+
+#if USB_CFG_HAVE_MEASURE_FRAME_LENGTH
+#ifdef __IAR_SYSTEMS_ASM__
+/* Register assignments for usbMeasureFrameLength on IAR cc */
+/* Calling conventions on IAR:
+ * First parameter passed in r16/r17, second in r18/r19 and so on.
+ * Callee must preserve r4-r15, r24-r29 (r28/r29 is frame pointer)
+ * Result is passed in r16/r17
+ * In case of the "tiny" memory model, pointers are only 8 bit with no
+ * padding. We therefore pass argument 1 as "16 bit unsigned".
+ */
+# define resL r16
+# define resH r17
+# define cnt16L r30
+# define cnt16H r31
+# define cntH r18
+
+#else /* __IAR_SYSTEMS_ASM__ */
+/* Register assignments for usbMeasureFrameLength on gcc */
+/* Calling conventions on gcc:
+ * First parameter passed in r24/r25, second in r22/23 and so on.
+ * Callee must preserve r1-r17, r28/r29
+ * Result is passed in r24/r25
+ */
+# define resL r24
+# define resH r25
+# define cnt16L r24
+# define cnt16H r25
+# define cntH r26