|
| 1 | +<!-- |
| 2 | + Architecture with no fracturable LUTs |
| 3 | +
|
| 4 | + - 40 nm technology |
| 5 | + - General purpose logic block: |
| 6 | + K = 6, N = 10 |
| 7 | + - Routing architecture: L = 4, fc_in = 0.15, fc_out = 0.15 |
| 8 | + - Unidirectional (mux-based) routing |
| 9 | +
|
| 10 | +
|
| 11 | + Details on Modelling: |
| 12 | +
|
| 13 | + Based on flagship k6_frac_N10_mem32K_40nm.xml architecture. This architecture has no fracturable LUTs nor any heterogeneous blocks. |
| 14 | + The delays and areas are based on a mix of values from commercial 40 nm |
| 15 | + FPGAs with a comparable architecture and 40 nm interconnect and |
| 16 | + transistor models. |
| 17 | +
|
| 18 | + Authors: Jason Luu, Jeff Goeders, Vaughn Betz |
| 19 | +--> |
| 20 | +<architecture> |
| 21 | + <!-- |
| 22 | + ODIN II specific config begins |
| 23 | + This part of the architecture file describes the "primitives" |
| 24 | + that exist in a device to the synthesis tool used to "elaborate" |
| 25 | + verilog into these primitives (which is called ODIN-II). |
| 26 | + Basic LUTs, I/Os and FFs are built into the language used by this |
| 27 | + flow (blif keywords .names, .input, .output and .latch), so they |
| 28 | + don't have to be described here. |
| 29 | + |
| 30 | + For this lab you are also given the benchmark netlists after |
| 31 | + synthesis is complete (in the blif directory), so you don't need |
| 32 | + to run ODIN II. |
| 33 | + --> |
| 34 | + <models> |
| 35 | + </models> |
| 36 | + <!-- ODIN II specific config ends --> |
| 37 | + |
| 38 | + <!-- Descriptions of the physical tiles that exist on the die begins --> |
| 39 | + <tiles> |
| 40 | + <tile name="io" area="0"> |
| 41 | + <sub_tile name="io" capacity="8"> |
| 42 | + <equivalent_sites> |
| 43 | + <site pb_type="io" pin_mapping="direct"/> |
| 44 | + </equivalent_sites> |
| 45 | + <input name="outpad" num_pins="1"/> |
| 46 | + <output name="inpad" num_pins="1"/> |
| 47 | + <clock name="clock" num_pins="1"/> |
| 48 | + <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.15"/> |
| 49 | + <!-- IOs go on the periphery of the FPGA in this |
| 50 | + architecture. Since I don't want to define four |
| 51 | + different physical I/Os for the left, right, top, |
| 52 | + and bottom sides just say each pin of the I/O |
| 53 | + block is accessible from all four sides so we can |
| 54 | + reach routing channels on some side of the block |
| 55 | + no matter which side of the chip we're on. |
| 56 | + --> |
| 57 | + <pinlocations pattern="custom"> |
| 58 | + <loc side="left">io.outpad io.inpad io.clock</loc> |
| 59 | + <loc side="top">io.outpad io.inpad io.clock</loc> |
| 60 | + <loc side="right">io.outpad io.inpad io.clock</loc> |
| 61 | + <loc side="bottom">io.outpad io.inpad io.clock</loc> |
| 62 | + </pinlocations> |
| 63 | + </sub_tile> |
| 64 | + </tile> |
| 65 | + |
| 66 | + <!-- Define general purpose logic block (CLB) begin --> |
| 67 | + <!-- Area below is for everything inside the |
| 68 | + logic block (LUTs, FFs, intra-cluster |
| 69 | + routing). It's a bit on the low side given the large crossbars in this |
| 70 | + architecture - more appropriate for a lower-cost |
| 71 | + FPGA with smaller transistors and narrower metal. |
| 72 | + --> |
| 73 | + <tile name="clb" area="18000"> |
| 74 | + <!-- We can place a clustered block of type clb on a tile location |
| 75 | + of type clb. |
| 76 | + --> |
| 77 | + <sub_tile name="clb"> |
| 78 | + <equivalent_sites> |
| 79 | + <site pb_type="clb" pin_mapping="direct"/> |
| 80 | + </equivalent_sites> |
| 81 | + |
| 82 | + <!-- We have a full crossbar between the cluster inputs and the |
| 83 | + LUT inputs, so the router can route to *any* input or from |
| 84 | + *any* output on the logic block. Hence mark the logic block |
| 85 | + inputs as fully logically equivalent (swappable by the router) and also the |
| 86 | + logic block outputs as logically equivalent, which means |
| 87 | + they can also be swapped by the router. |
| 88 | + --> |
| 89 | + |
| 90 | + <input name="I" num_pins="40" equivalent="full"/> |
| 91 | + <output name="O" num_pins="10" equivalent="instance"/> |
| 92 | + <clock name="clk" num_pins="1"/> |
| 93 | + <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.15"/> |
| 94 | + <pinlocations pattern="spread"/> |
| 95 | + </sub_tile> |
| 96 | + </tile> |
| 97 | + </tiles> |
| 98 | + <!-- Physical tile descriptions end --> |
| 99 | + |
| 100 | + <!-- Chip layout (in terms of where tiles are) begins --> |
| 101 | + <layout> |
| 102 | + <auto_layout aspect_ratio="1.0"> |
| 103 | + <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners--> |
| 104 | + <perimeter type="io" priority="100"/> |
| 105 | + <corners type="EMPTY" priority="101"/> |
| 106 | + <!--Fill with 'clb'--> |
| 107 | + <fill type="clb" priority="10"/> |
| 108 | + </auto_layout> |
| 109 | + |
| 110 | + <fixed_layout name="vtr_homogeneous_extra_small" width="10" height="10"> |
| 111 | + <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners--> |
| 112 | + <perimeter type="io" priority="100"/> |
| 113 | + <corners type="EMPTY" priority="101"/> |
| 114 | + <!--Fill with 'clb'--> |
| 115 | + <fill type="clb" priority="10"/> |
| 116 | + |
| 117 | + <interposer_cut dim="x" loc="5"> |
| 118 | + <interdie_wire sg_name="interposer_sg" sg_link="L_UP" offset_start="0" offset_end="0" offset_increment="0" num="30"/> |
| 119 | + <interdie_wire sg_name="interposer_sg" sg_link="L_DOWN" offset_start="0" offset_end="0" offset_increment="0" num="30"/> |
| 120 | + </interposer_cut> |
| 121 | + </fixed_layout> |
| 122 | + </layout> |
| 123 | + <!-- Chip layout ends --> |
| 124 | + |
| 125 | + <!-- Electrical and inter-cluster (general) routing description begins --> |
| 126 | + <device> |
| 127 | + <!-- Some area and timing parameters --> |
| 128 | + <sizing R_minW_nmos="8926" R_minW_pmos="16067"/> |
| 129 | + <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing) |
| 130 | + area; set to 0 since we explicitly set the area of all blocks currently in this architecture file. |
| 131 | + --> |
| 132 | + <area grid_logic_tile_area="0"/> |
| 133 | + <chan_width_distr> |
| 134 | + <x distr="uniform" peak="1.000000"/> |
| 135 | + <y distr="uniform" peak="1.000000"/> |
| 136 | + </chan_width_distr> |
| 137 | + |
| 138 | + <!-- Define the switch block pattern (pattern of switches between inter-tile routing wires) |
| 139 | + The Wilton switch block is a sample pattern; you can use custom switch blocks for more control --> |
| 140 | + <switch_block type="wilton" fs="3"/> |
| 141 | + |
| 142 | + <!-- Set which switch to use for input connection blocks. Only affects timing and area, not connectivity --> |
| 143 | + <connection_block input_switch_name="ipin_cblock"/> |
| 144 | + </device> |
| 145 | + <switchlist> |
| 146 | + <!-- VB: the mux_trans_size and buf_size data below is in minimum width transistor *areas*, assuming the purple |
| 147 | + book area formula. This means the mux transistors are about 5x minimum drive strength. |
| 148 | + We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large |
| 149 | + mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. |
| 150 | + --> |
| 151 | + <switch type="mux" name="0" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/> |
| 152 | + <!--switch ipin_cblock resistance set to yield for 4x minimum drive strength buffer--> |
| 153 | + <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="7.247000e-11" mux_trans_size="1.222260" buf_size="auto"/> |
| 154 | + </switchlist> |
| 155 | + <segmentlist> |
| 156 | + <!-- VB & JL: using ITRS metal stack data, 96 nm half pitch wires, which are intermediate metal width/space. |
| 157 | + Wires of this pitch will fit over a 90 nm |
| 158 | + high logic tile (which is about the height of a Stratix IV logic tile). |
| 159 | + I'm using a tile length of 90 nm, corresponding to the length of a Stratix IV tile if it were square. |
| 160 | + length below is in units of logic blocks, and Rmetal and Cmetal are |
| 161 | + per logic block passed, so wire delay adapts automatically if you change the |
| 162 | + length=? value. --> |
| 163 | + |
| 164 | + <!-- Currently only one type of routing wire, which |
| 165 | + is of length 4 and has switches to every connection |
| 166 | + box (4 of them) and switch box (5 of them) |
| 167 | + it passes. You can change wirelengths just by changing the length="?" values |
| 168 | + and changing the number of 1's (or 0's) in the <sb type and <cb type lines to |
| 169 | + match the number of switch blocks and connection blocks a wire of that length |
| 170 | + would span. The segment is named "wire" because the scatter-gather pattern at the end of this file refers to it by that name (from_type, to_type and seg_type). --> |
| 171 | + <segment name="wire" freq="1.000000" length="4" type="unidir" Rmetal="101" Cmetal="22.5e-15"> |
| 172 | + <mux name="0"/> |
| 173 | + <sb type="pattern">1 1 1 1 1</sb> |
| 174 | + <cb type="pattern">1 1 1 1</cb> |
| 175 | + </segment> |
| 176 | + </segmentlist> |
| 177 | + <!-- Electrical and inter-cluster routing description ends --> |
| 178 | + |
| 179 | + <!-- Description of the capabilities (number of BLEs, modes) and local interconnect in |
| 180 | + each type of complex (clustered) block (e.g. LBs) begins |
| 181 | + --> |
| 182 | + <complexblocklist> |
| 183 | + <!-- Define I/O pads begin --> |
| 184 | + <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. --> |
| 185 | + <pb_type name="io"> |
| 186 | + <input name="outpad" num_pins="1"/> |
| 187 | + <output name="inpad" num_pins="1"/> |
| 188 | + <clock name="clock" num_pins="1"/> |
| 189 | + <!-- IOs can operate as either inputs or outputs. |
| 190 | + The delays below are to and from registers in the I/O (and generally I/Os are registered |
| 191 | + today). |
| 192 | + --> |
| 193 | + <mode name="inpad"> |
| 194 | + <pb_type name="inpad" blif_model=".input" num_pb="1"> |
| 195 | + <output name="inpad" num_pins="1"/> |
| 196 | + </pb_type> |
| 197 | + <interconnect> |
| 198 | + <direct name="inpad" input="inpad.inpad" output="io.inpad"> |
| 199 | + <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/> |
| 200 | + </direct> |
| 201 | + </interconnect> |
| 202 | + </mode> |
| 203 | + <mode name="outpad"> |
| 204 | + <pb_type name="outpad" blif_model=".output" num_pb="1"> |
| 205 | + <input name="outpad" num_pins="1"/> |
| 206 | + </pb_type> |
| 207 | + <interconnect> |
| 208 | + <direct name="outpad" input="io.outpad" output="outpad.outpad"> |
| 209 | + <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/> |
| 210 | + </direct> |
| 211 | + </interconnect> |
| 212 | + </mode> |
| 213 | + |
| 214 | + <!-- Not modeling I/O power for now --> |
| 215 | + <power method="ignore"/> |
| 216 | + </pb_type> |
| 217 | + <!-- Define I/O pads ends --> |
| 218 | + |
| 219 | + <!-- Define general purpose logic block (CLB) begin --> |
| 220 | + <!-- Area below is for everything inside the |
| 221 | + logic block (LUTs, FFs, intra-cluster |
| 222 | + routing). |
| 223 | + --> |
| 224 | + <pb_type name="clb"> |
| 225 | + <input name="I" num_pins="40" equivalent="full"/> |
| 226 | + <output name="O" num_pins="10" equivalent="instance"/> |
| 227 | + <clock name="clk" num_pins="1"/> |
| 228 | + <!-- Describe basic logic element. |
| 229 | + Each basic logic element has a 6-LUT that can be optionally registered |
| 230 | + --> |
| 231 | + <pb_type name="fle" num_pb="10"> |
| 232 | + <input name="in" num_pins="6"/> |
| 233 | + <output name="out" num_pins="1"/> |
| 234 | + <clock name="clk" num_pins="1"/> |
| 235 | + <!-- 6-LUT mode definition begin --> |
| 236 | + <mode name="n1_lut6"> |
| 237 | + <!-- Define 6-LUT mode --> |
| 238 | + <pb_type name="ble6" num_pb="1"> |
| 239 | + <input name="in" num_pins="6"/> |
| 240 | + <output name="out" num_pins="1"/> |
| 241 | + <clock name="clk" num_pins="1"/> |
| 242 | + <!-- Define LUT --> |
| 243 | + <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut"> |
| 244 | + <input name="in" num_pins="6" port_class="lut_in"/> |
| 245 | + <output name="out" num_pins="1" port_class="lut_out"/> |
| 246 | + <!-- LUT timing using delay matrix --> |
| 247 | + <!-- These are the delays per LUT input on a Stratix IV LUT. |
| 248 | + The average is 261 ps, and inputs earlier in the mux tree are slower. |
| 249 | + --> |
| 250 | + <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out"> |
| 251 | + 82e-12 |
| 252 | + 173e-12 |
| 253 | + 261e-12 |
| 254 | + 263e-12 |
| 255 | + 398e-12 |
| 256 | + 397e-12 |
| 257 | + </delay_matrix> |
| 258 | + </pb_type> |
| 259 | + <!-- Define flip-flop --> |
| 260 | + <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop"> |
| 261 | + <input name="D" num_pins="1" port_class="D"/> |
| 262 | + <output name="Q" num_pins="1" port_class="Q"/> |
| 263 | + <clock name="clk" num_pins="1" port_class="clock"/> |
| 264 | + <T_setup value="66e-12" port="ff.D" clock="clk"/> |
| 265 | + <T_clock_to_Q max="124e-12" port="ff.Q" clock="clk"/> |
| 266 | + </pb_type> |
| 267 | + |
| 268 | + <!-- many lines below to describe the interconnect |
| 269 | + wires, muxes and crossbars inside a cluster. |
| 270 | + --> |
| 271 | + <interconnect> |
| 272 | + <direct name="direct1" input="ble6.in" output="lut6[0:0].in"/> |
| 273 | + <direct name="direct2" input="lut6.out" output="ff.D"> |
| 274 | + <!-- Advanced user option that tells CAD tool to find LUT+FF pairs in netlist --> |
| 275 | + <pack_pattern name="ble6" in_port="lut6.out" out_port="ff.D"/> |
| 276 | + </direct> |
| 277 | + <direct name="direct3" input="ble6.clk" output="ff.clk"/> |
| 278 | + <mux name="mux1" input="ff.Q lut6.out" output="ble6.out"> |
| 279 | + <!-- LUT to output is faster than FF to output on a Stratix IV --> |
| 280 | + <delay_constant max="25e-12" in_port="lut6.out" out_port="ble6.out"/> |
| 281 | + <delay_constant max="45e-12" in_port="ff.Q" out_port="ble6.out"/> |
| 282 | + </mux> |
| 283 | + </interconnect> |
| 284 | + </pb_type> |
| 285 | + <interconnect> |
| 286 | + <direct name="direct1" input="fle.in" output="ble6.in"/> |
| 287 | + <direct name="direct2" input="ble6.out" output="fle.out[0:0]"/> |
| 288 | + <direct name="direct3" input="fle.clk" output="ble6.clk"/> |
| 289 | + </interconnect> |
| 290 | + </mode> |
| 291 | + <!-- 6-LUT mode definition end --> |
| 292 | + </pb_type> |
| 293 | + <interconnect> |
| 294 | + <!-- We use a full crossbar to get logical equivalence at inputs of CLB |
| 295 | + The delays below come from Stratix IV. The delay through a connection block |
| 296 | + input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps |
| 297 | + delay on the connection block input mux (modeled by Ian Kuon), so the remaining |
| 298 | + delay within the crossbar is 95 ps. |
| 299 | + The delay of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT. |
| 300 | + Since all our LUT outputs go to a BLE output, and have a delay of |
| 301 | + 25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback |
| 302 | + to get the part that should be marked on the crossbar. --> |
| 303 | + <complete name="crossbar" input="clb.I fle[9:0].out" output="fle[9:0].in"> |
| 304 | + <delay_constant max="95e-12" in_port="clb.I" out_port="fle[9:0].in"/> |
| 305 | + <delay_constant max="75e-12" in_port="fle[9:0].out" out_port="fle[9:0].in"/> |
| 306 | + </complete> |
| 307 | + <complete name="clks" input="clb.clk" output="fle[9:0].clk"> |
| 308 | + </complete> |
| 309 | + |
| 310 | + <!-- The BLE outputs are directly connected to the |
| 311 | + CLB (cluster) outputs. |
| 312 | + --> |
| 313 | + <direct name="clbouts1" input="fle[9:0].out" output="clb.O"/> |
| 314 | + </interconnect> |
| 315 | + </pb_type> |
| 316 | + <!-- Define general purpose logic block (CLB) ends --> |
| 317 | + </complexblocklist> |
| 318 | + <power> |
| 319 | + <local_interconnect C_wire="2.5e-10"/> |
| 320 | + <mux_transistor_size mux_transistor_size="3"/> |
| 321 | + <FF_size FF_size="4"/> |
| 322 | + <LUT_transistor_size LUT_transistor_size="4"/> |
| 323 | + </power> |
| 324 | + <clocks> |
| 325 | + <clock buffer_size="auto" C_wire="2.5e-10"/> |
| 326 | + </clocks> |
| 327 | + |
| 328 | + <scatter_gather_list> |
| 329 | + <sg_pattern name="interposer_sg" type="unidir"> |
| 330 | + <gather> |
| 331 | + <wireconn num_conns="30" from_type="wire" from_switchpoint="1" side="rltb"/> |
| 332 | + </gather> |
| 333 | + |
| 334 | + <scatter> |
| 335 | + <wireconn num_conns="30" to_type="wire" to_switchpoint="0" side="rtl"/> |
| 336 | + </scatter> |
| 337 | + |
| 338 | + <sg_link_list> <!-- mux must name a switch declared in the switchlist; "0" is the routing mux already used by the length-4 segment --> |
| 339 | + <sg_link name="L_UP" y_offset="1" mux="0" seg_type="wire"/> |
| 340 | + <sg_link name="L_DOWN" y_offset="-1" mux="0" seg_type="wire"/> |
| 341 | + </sg_link_list> |
| 342 | + </sg_pattern> |
| 343 | + </scatter_gather_list> |
| 344 | +</architecture> |
0 commit comments